Diffstat (limited to 'media/libvpx/libvpx/vpx_dsp/loongarch')
28 files changed, 13654 insertions, 0 deletions
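This change adds 28 LoongArch LSX (128-bit SIMD) implementations of vpx_dsp kernels; like the other per-architecture variants, they are picked up at run time through the generated vpx_dsp_rtcd.h dispatch table. As a quick orientation, here is a minimal self-check sketch for one of the new kernels, assuming a LoongArch build of the tree. The harness itself (check_hadamard8x8 and its fill pattern) is invented for illustration and is not part of the patch; vpx_hadamard_8x8_c, vpx_hadamard_8x8_lsx, tran_low_t, and the headers come from libvpx.

#include <stdint.h>
#include <string.h>

#include "./vpx_dsp_rtcd.h"         /* declares the _c and _lsx variants */
#include "vpx_dsp/vpx_dsp_common.h" /* tran_low_t */

/* Illustrative check (not part of the patch): the LSX kernel should match
 * the portable C reference bit-for-bit on any 8x8 block of residuals. */
static int check_hadamard8x8(void) {
  int16_t src[8 * 8];
  tran_low_t ref[64], opt[64];
  int i;
  /* Arbitrary deterministic fill covering negative and positive values. */
  for (i = 0; i < 64; ++i) src[i] = (int16_t)(((i * 67) & 0xff) - 128);
  vpx_hadamard_8x8_c(src, 8, ref); /* row stride of 8 int16 elements */
  vpx_hadamard_8x8_lsx(src, 8, opt);
  return memcmp(ref, opt, sizeof(ref)) == 0;
}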
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/avg_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/avg_lsx.c new file mode 100644 index 0000000000..750c9de29f --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/avg_lsx.c @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <stdlib.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/bitdepth_conversion_lsx.h" + +void vpx_hadamard_8x8_lsx(const int16_t *src, ptrdiff_t src_stride, + tran_low_t *dst) { + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + ptrdiff_t src_stride2 = src_stride << 1; + ptrdiff_t src_stride3 = src_stride2 + src_stride; + ptrdiff_t src_stride4 = src_stride2 << 1; + ptrdiff_t src_stride6 = src_stride3 << 1; + + int16_t *src_tmp = (int16_t *)src; + src0 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride2, src_tmp, src_stride4, src1, src2); + src3 = __lsx_vldx(src_tmp, src_stride6); + src_tmp += src_stride4; + src4 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride2, src_tmp, src_stride4, src5, src6); + src7 = __lsx_vldx(src_tmp, src_stride6); + + LSX_BUTTERFLY_8_H(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, + tmp4, tmp6, tmp7, tmp5, tmp3, tmp1); + LSX_BUTTERFLY_8_H(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, + src4, src5, src7, src6, src3, src2); + LSX_BUTTERFLY_8_H(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, + tmp3, tmp4, tmp5, tmp1, tmp6, tmp2); + LSX_TRANSPOSE8x8_H(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1, + src2, src3, src4, src5, src6, src7); + LSX_BUTTERFLY_8_H(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, + tmp4, tmp6, tmp7, tmp5, tmp3, tmp1); + LSX_BUTTERFLY_8_H(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, + src4, src5, src7, src6, src3, src2); + LSX_BUTTERFLY_8_H(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, + tmp3, tmp4, tmp5, tmp1, tmp6, tmp2); + store_tran_low(tmp0, dst, 0); + store_tran_low(tmp1, dst, 8); + store_tran_low(tmp2, dst, 16); + store_tran_low(tmp3, dst, 24); + store_tran_low(tmp4, dst, 32); + store_tran_low(tmp5, dst, 40); + store_tran_low(tmp6, dst, 48); + store_tran_low(tmp7, dst, 56); +} + +void vpx_hadamard_16x16_lsx(const int16_t *src, ptrdiff_t src_stride, + tran_low_t *dst) { + int i; + __m128i a0, a1, a2, a3, b0, b1, b2, b3; + + /* Rearrange 16x16 to 8x32 and remove stride. + * Top left first. */ + vpx_hadamard_8x8_lsx(src + 0 + 0 * src_stride, src_stride, dst + 0); + /* Top right. */ + vpx_hadamard_8x8_lsx(src + 8 + 0 * src_stride, src_stride, dst + 64); + /* Bottom left. */ + vpx_hadamard_8x8_lsx(src + 0 + 8 * src_stride, src_stride, dst + 128); + /* Bottom right. 
*/ + vpx_hadamard_8x8_lsx(src + 8 + 8 * src_stride, src_stride, dst + 192); + + for (i = 0; i < 64; i += 8) { + a0 = load_tran_low(dst); + a1 = load_tran_low(dst + 64); + a2 = load_tran_low(dst + 128); + a3 = load_tran_low(dst + 192); + + LSX_BUTTERFLY_4_H(a0, a2, a3, a1, b0, b2, b3, b1); + DUP4_ARG2(__lsx_vsrai_h, b0, 1, b1, 1, b2, 1, b3, 1, b0, b1, b2, b3); + LSX_BUTTERFLY_4_H(b0, b1, b3, b2, a0, a1, a3, a2); + + store_tran_low(a0, dst, 0); + store_tran_low(a1, dst, 64); + store_tran_low(a2, dst, 128); + store_tran_low(a3, dst, 192); + + dst += 8; + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/avg_pred_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/avg_pred_lsx.c new file mode 100644 index 0000000000..482626080a --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/avg_pred_lsx.c @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_util/loongson_intrinsics.h" + +void vpx_comp_avg_pred_lsx(uint8_t *comp_pred, const uint8_t *pred, int width, + int height, const uint8_t *ref, int ref_stride) { + // width > 8 || width == 8 || width == 4 + if (width > 8) { + int i, j; + for (i = 0; i < height; ++i) { + for (j = 0; j < width; j += 16) { + __m128i p, r, avg; + + p = __lsx_vld(pred + j, 0); + r = __lsx_vld(ref + j, 0); + avg = __lsx_vavgr_bu(p, r); + __lsx_vst(avg, comp_pred + j, 0); + } + comp_pred += width; + pred += width; + ref += ref_stride; + } + } else if (width == 8) { + int i = height * width; + do { + __m128i p, r, r_0, r_1; + + p = __lsx_vld(pred, 0); + r_0 = __lsx_vld(ref, 0); + ref += ref_stride; + r_1 = __lsx_vld(ref, 0); + ref += ref_stride; + r = __lsx_vilvl_d(r_1, r_0); + r = __lsx_vavgr_bu(p, r); + + __lsx_vst(r, comp_pred, 0); + + pred += 16; + comp_pred += 16; + i -= 16; + } while (i); + } else { // width = 4 + int i = height * width; + assert(width == 4); + do { + __m128i p, r, r_0, r_1, r_2, r_3; + p = __lsx_vld(pred, 0); + + if (width == ref_stride) { + r = __lsx_vld(ref, 0); + ref += 16; + } else { + r_0 = __lsx_vld(ref, 0); + ref += ref_stride; + r_1 = __lsx_vld(ref, 0); + ref += ref_stride; + r_2 = __lsx_vld(ref, 0); + ref += ref_stride; + r_3 = __lsx_vld(ref, 0); + ref += ref_stride; + DUP2_ARG2(__lsx_vilvl_w, r_1, r_0, r_3, r_2, r_0, r_2); + r = __lsx_vilvl_d(r_2, r_0); + } + r = __lsx_vavgr_bu(p, r); + + __lsx_vst(r, comp_pred, 0); + comp_pred += 16; + pred += 16; + i -= 16; + } while (i); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/bitdepth_conversion_lsx.h b/media/libvpx/libvpx/vpx_dsp/loongarch/bitdepth_conversion_lsx.h new file mode 100644 index 0000000000..b0db1e99c5 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/bitdepth_conversion_lsx.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VPX_VPX_DSP_LOONGARCH_BITDEPTH_CONVERSION_LSX_H_ +#define VPX_VPX_DSP_LOONGARCH_BITDEPTH_CONVERSION_LSX_H_ + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_util/loongson_intrinsics.h" + +static INLINE __m128i load_tran_low(const tran_low_t *s) { +#if CONFIG_VP9_HIGHBITDEPTH + __m128i v0_m = __lsx_vld(s, 0); + __m128i v1_m = __lsx_vld(s + 4, 0); + return __lsx_vsrlni_h_w(v0_m, v1_m, 0); +#else + return __lsx_vld(s, 0); +#endif +} + +static INLINE void store_tran_low(__m128i v, tran_low_t *s, int32_t c) { +#if CONFIG_VP9_HIGHBITDEPTH + __m128i v0_m, v1_m; + v1_m = __lsx_vexth_w_h(v); + v0_m = __lsx_vsllwil_w_h(v, 0); + __lsx_vst(v0_m, s + c, 0); + __lsx_vst(v1_m, s + c + 4, 0); +#else + __lsx_vst(v, s + c, 0); +#endif +} + +#endif // VPX_VPX_DSP_LOONGARCH_BITDEPTH_CONVERSION_LSX_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/fwd_dct32x32_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/fwd_dct32x32_lsx.c new file mode 100644 index 0000000000..9bb3877212 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/fwd_dct32x32_lsx.c @@ -0,0 +1,1176 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/fwd_txfm_lsx.h" +#include "vpx_dsp/fwd_txfm.h" + +#define UNPCK_SH_SW(in, out0, out1) \ + do { \ + out0 = __lsx_vsllwil_w_h(in, 0); \ + out1 = __lsx_vexth_w_h(in); \ + } while (0) + +static void fdct8x32_1d_column_load_butterfly(const int16_t *input, + int32_t src_stride, + int16_t *temp_buff) { + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i step0, step1, step2, step3; + __m128i in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1; + __m128i step0_1, step1_1, step2_1, step3_1; + + int32_t stride = src_stride << 1; + int32_t stride2 = stride << 1; + int32_t stride3 = stride2 + stride; + const int16_t *input_tmp = (int16_t *)input; + + in0 = __lsx_vld(input_tmp, 0); + DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in1, in2); + in3 = __lsx_vldx(input_tmp, stride3); + + input_tmp += stride2; + in0_1 = __lsx_vld(input_tmp, 0); + DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in1_1, in2_1); + in3_1 = __lsx_vldx(input_tmp, stride3); + + input_tmp = input + (src_stride * 24); + in4_1 = __lsx_vld(input_tmp, 0); + DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in5_1, in6_1); + in7_1 = __lsx_vldx(input_tmp, stride3); + + input_tmp += stride2; + in4 = __lsx_vld(input_tmp, 0); + DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in5, in6); + in7 = __lsx_vldx(input_tmp, stride3); + + DUP4_ARG2(__lsx_vslli_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3); + DUP4_ARG2(__lsx_vslli_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7); + DUP4_ARG2(__lsx_vslli_h, in0_1, 2, in1_1, 2, in2_1, 2, in3_1, 2, in0_1, in1_1, + in2_1, in3_1); + DUP4_ARG2(__lsx_vslli_h, in4_1, 2, in5_1, 2, in6_1, 2, in7_1, 2, in4_1, in5_1, + in6_1, in7_1); + LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2, + step3, in4, in5, in6, in7); + LSX_BUTTERFLY_8_H(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, + step0_1, step1_1, step2_1, step3_1, 
in4_1, in5_1, in6_1, + in7_1); + + __lsx_vst(step0, temp_buff, 0); + __lsx_vst(step1, temp_buff, 16); + __lsx_vst(step2, temp_buff, 32); + __lsx_vst(step3, temp_buff, 48); + + __lsx_vst(in4, temp_buff, 448); + __lsx_vst(in5, temp_buff, 464); + __lsx_vst(in6, temp_buff, 480); + __lsx_vst(in7, temp_buff, 496); + + __lsx_vst(step0_1, temp_buff, 64); + __lsx_vst(step1_1, temp_buff, 80); + __lsx_vst(step2_1, temp_buff, 96); + __lsx_vst(step3_1, temp_buff, 112); + + __lsx_vst(in4_1, temp_buff, 384); + __lsx_vst(in5_1, temp_buff, 400); + __lsx_vst(in6_1, temp_buff, 416); + __lsx_vst(in7_1, temp_buff, 432); + + /* 3rd and 4th set */ + input_tmp = input + (src_stride * 8); + in0 = __lsx_vld(input_tmp, 0); + DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in1, in2); + in3 = __lsx_vldx(input_tmp, stride3); + + input_tmp += stride2; + in0_1 = __lsx_vld(input_tmp, 0); + DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in1_1, in2_1); + in3_1 = __lsx_vldx(input_tmp, stride3); + + input_tmp += stride2; + in4_1 = __lsx_vld(input_tmp, 0); + DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in5_1, in6_1); + in7_1 = __lsx_vldx(input_tmp, stride3); + + input_tmp += stride2; + in4 = __lsx_vld(input_tmp, 0); + DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in5, in6); + in7 = __lsx_vldx(input_tmp, stride3); + DUP4_ARG2(__lsx_vslli_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3); + DUP4_ARG2(__lsx_vslli_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7); + DUP4_ARG2(__lsx_vslli_h, in0_1, 2, in1_1, 2, in2_1, 2, in3_1, 2, in0_1, in1_1, + in2_1, in3_1); + DUP4_ARG2(__lsx_vslli_h, in4_1, 2, in5_1, 2, in6_1, 2, in7_1, 2, in4_1, in5_1, + in6_1, in7_1); + + LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2, + step3, in4, in5, in6, in7); + LSX_BUTTERFLY_8_H(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, + step0_1, step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, + in7_1); + + __lsx_vst(step0, temp_buff, 128); + __lsx_vst(step1, temp_buff, 144); + __lsx_vst(step2, temp_buff, 160); + __lsx_vst(step3, temp_buff, 176); + + __lsx_vst(in4, temp_buff, 320); + __lsx_vst(in5, temp_buff, 336); + __lsx_vst(in6, temp_buff, 352); + __lsx_vst(in7, temp_buff, 368); + + __lsx_vst(step0_1, temp_buff, 192); + __lsx_vst(step1_1, temp_buff, 208); + __lsx_vst(step2_1, temp_buff, 224); + __lsx_vst(step3_1, temp_buff, 240); + + __lsx_vst(in4_1, temp_buff, 256); + __lsx_vst(in5_1, temp_buff, 272); + __lsx_vst(in6_1, temp_buff, 288); + __lsx_vst(in7_1, temp_buff, 304); +} + +static void fdct8x32_1d_column_even_store(int16_t *input, int16_t *temp) { + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i in8, in9, in10, in11, in12, in13, in14, in15; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i temp0, temp1; + + /* fdct even */ + DUP4_ARG2(__lsx_vld, input, 0, input, 16, input, 32, input, 48, in0, in1, in2, + in3); + DUP4_ARG2(__lsx_vld, input, 192, input, 208, input, 224, input, 240, in12, + in13, in14, in15); + LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in12, in13, in14, in15, vec0, vec1, + vec2, vec3, in12, in13, in14, in15); + DUP4_ARG2(__lsx_vld, input, 64, input, 80, input, 96, input, 112, in4, in5, + in6, in7); + DUP4_ARG2(__lsx_vld, input, 128, input, 144, input, 160, input, 176, in8, in9, + in10, in11); + LSX_BUTTERFLY_8_H(in4, in5, in6, in7, in8, in9, in10, in11, vec4, vec5, vec6, + vec7, in8, in9, in10, in11); + + /* Stage 3 */ + DUP4_ARG2(__lsx_vadd_h, vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, + in1, in2, 
in3); + LSX_BUTTERFLY_4_H(in0, in1, in2, in3, temp0, in4, in1, in0); + DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + __lsx_vst(temp0, temp, 0); + __lsx_vst(temp1, temp, 1024); + + DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + __lsx_vst(temp0, temp, 512); + __lsx_vst(temp1, temp, 1536); + + DUP4_ARG2(__lsx_vsub_h, vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, vec7, + vec6, vec5, vec4); + DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); + DUP2_ARG2(__lsx_vadd_h, vec4, vec5, vec7, vec6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + __lsx_vst(temp0, temp, 256); + __lsx_vst(temp1, temp, 1792); + + DUP2_ARG2(__lsx_vsub_h, vec4, vec5, vec7, vec6, vec4, vec7); + DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + __lsx_vst(temp0, temp, 1280); + __lsx_vst(temp1, temp, 768); + + DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); + DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); + DUP4_ARG2(__lsx_vadd_h, in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, + vec1, vec6, in2); + DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); + DUP2_ARG2(__lsx_vadd_h, in0, in1, in2, in3, vec0, vec7); + DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + __lsx_vst(temp0, temp, 128); + __lsx_vst(temp1, temp, 1920); + + DUP2_ARG2(__lsx_vsub_h, in0, in1, in2, in3, in0, in2); + DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + __lsx_vst(temp0, temp, 1152); + __lsx_vst(temp1, temp, 896); + + DUP2_ARG2(__lsx_vsub_h, in9, vec2, in14, vec5, vec2, vec5); + temp0 = __lsx_vneg_h(vec2); + DOTP_CONST_PAIR(temp0, vec5, cospi_24_64, cospi_8_64, in2, in1); + DUP4_ARG2(__lsx_vsub_h, in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, + vec2, vec5); + DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + __lsx_vst(temp0, temp, 640); + __lsx_vst(temp1, temp, 1408); + + DUP2_ARG2(__lsx_vadd_h, in3, in2, in0, in1, vec3, vec4); + DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + __lsx_vst(temp0, temp, 384); + __lsx_vst(temp1, temp, 1664); +} + +static void fdct8x32_1d_column_odd_store(int16_t *input, int16_t *temp_ptr) { + __m128i in16, in17, in18, in19, in20, in21, in22, in23; + __m128i in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5; + __m128i tmp0, tmp1; + + DUP4_ARG2(__lsx_vld, input, 64, input, 80, input, 160, input, 176, in20, in21, + in26, in27); + + DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27); + DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26); + + DUP4_ARG2(__lsx_vld, input, 32, input, 48, input, 192, input, 208, in18, in19, + in28, in29); + + vec4 = __lsx_vsub_h(in19, in20); + __lsx_vst(vec4, input, 64); + vec4 = __lsx_vsub_h(in18, in21); + __lsx_vst(vec4, input, 80); + vec4 = __lsx_vsub_h(in29, in26); + __lsx_vst(vec4, input, 160); + vec4 = __lsx_vsub_h(in28, in27); + __lsx_vst(vec4, input, 176); + + in21 = __lsx_vadd_h(in18, in21); + in20 = __lsx_vadd_h(in19, in20); + in27 = __lsx_vadd_h(in28, in27); + in26 = __lsx_vadd_h(in29, in26); + + DUP4_ARG2(__lsx_vld, input, 96, input, 112, input, 128, input, 144, 
in22, + in23, in24, in25); + DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25); + DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24); + + DUP4_ARG2(__lsx_vld, input, 0, input, 16, input, 224, input, 240, in16, in17, + in30, in31); + + vec4 = __lsx_vsub_h(in17, in22); + __lsx_vst(vec4, input, 32); + vec4 = __lsx_vsub_h(in16, in23); + __lsx_vst(vec4, input, 48); + vec4 = __lsx_vsub_h(in31, in24); + __lsx_vst(vec4, input, 192); + vec4 = __lsx_vsub_h(in30, in25); + __lsx_vst(vec4, input, 208); + + DUP4_ARG2(__lsx_vadd_h, in16, in23, in17, in22, in30, in25, in31, in24, in16, + in17, in30, in31); + DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29); + DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28); + DUP4_ARG2(__lsx_vadd_h, in16, in19, in17, in18, in30, in29, in31, in28, in27, + in22, in21, in25); + DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24); + DUP2_ARG2(__lsx_vadd_h, in27, in26, in25, in24, in23, in20); + DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + __lsx_vst(vec5, temp_ptr, 0); + __lsx_vst(vec4, temp_ptr, 1920); + + DUP2_ARG2(__lsx_vsub_h, in27, in26, in25, in24, in22, in21); + DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + __lsx_vst(vec5, temp_ptr, 896); + __lsx_vst(vec4, temp_ptr, 1024); + + DUP4_ARG2(__lsx_vsub_h, in17, in18, in16, in19, in31, in28, in30, in29, in23, + in26, in24, in20); + tmp0 = __lsx_vneg_h(in23); + DOTP_CONST_PAIR(tmp0, in20, cospi_28_64, cospi_4_64, in27, in25); + DUP2_ARG2(__lsx_vsub_h, in26, in27, in24, in25, in23, in20); + DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + __lsx_vst(vec4, temp_ptr, 1408); + __lsx_vst(vec5, temp_ptr, 512); + + DUP2_ARG2(__lsx_vadd_h, in26, in27, in24, in25, in22, in21); + DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + __lsx_vst(vec4, temp_ptr, 384); + __lsx_vst(vec5, temp_ptr, 1536); + + DUP4_ARG2(__lsx_vld, input, 32, input, 48, input, 64, input, 80, in22, in23, + in20, in21); + DUP4_ARG2(__lsx_vld, input, 160, input, 176, input, 192, input, 208, in26, + in27, in24, in25); + in16 = in20; + in17 = in21; + DUP2_ARG1(__lsx_vneg_h, in16, in17, tmp0, tmp1); + DOTP_CONST_PAIR(tmp0, in27, cospi_24_64, cospi_8_64, in20, in27); + DOTP_CONST_PAIR(tmp1, in26, cospi_24_64, cospi_8_64, in21, in26); + DUP4_ARG2(__lsx_vsub_h, in23, in20, in22, in21, in25, in26, in24, in27, in28, + in17, in18, in31); + DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30); + DUP2_ARG2(__lsx_vadd_h, in28, in29, in31, in30, in16, in19); + DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + __lsx_vst(vec5, temp_ptr, 1664); + __lsx_vst(vec4, temp_ptr, 256); + + DUP2_ARG2(__lsx_vsub_h, in28, in29, in31, in30, in17, in18); + DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + __lsx_vst(vec5, temp_ptr, 640); + __lsx_vst(vec4, temp_ptr, 1280); + + DUP4_ARG2(__lsx_vadd_h, in22, in21, in23, in20, in24, in27, in25, in26, in16, + in29, in30, in19); + tmp0 = __lsx_vneg_h(in16); + DOTP_CONST_PAIR(tmp0, in19, cospi_12_64, cospi_20_64, in28, in31); + DUP2_ARG2(__lsx_vsub_h, in29, in28, in30, in31, in16, in19); + DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + __lsx_vst(vec5, 
temp_ptr, 1152); + __lsx_vst(vec4, temp_ptr, 768); + + DUP2_ARG2(__lsx_vadd_h, in29, in28, in30, in31, in17, in18); + DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + __lsx_vst(vec5, temp_ptr, 128); + __lsx_vst(vec4, temp_ptr, 1792); +} + +static void fdct8x32_1d_column(const int16_t *input, int32_t src_stride, + int16_t *tmp_buf, int16_t *tmp_buf_big) { + fdct8x32_1d_column_load_butterfly(input, src_stride, tmp_buf); + fdct8x32_1d_column_even_store(tmp_buf, tmp_buf_big); + fdct8x32_1d_column_odd_store(tmp_buf + 128, (tmp_buf_big + 32)); +} + +static void fdct8x32_1d_row_load_butterfly(int16_t *temp_buff, + int16_t *output) { + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i in8, in9, in10, in11, in12, in13, in14, in15; + __m128i step0, step1, step2, step3, step4, step5, step6, step7; + + DUP4_ARG2(__lsx_vld, temp_buff, 0, temp_buff, 64, temp_buff, 128, temp_buff, + 192, in0, in1, in2, in3); + DUP4_ARG2(__lsx_vld, temp_buff, 256, temp_buff, 320, temp_buff, 384, + temp_buff, 448, in4, in5, in6, in7); + DUP4_ARG2(__lsx_vld, temp_buff, 48, temp_buff, 112, temp_buff, 176, temp_buff, + 240, in8, in9, in10, in11); + DUP4_ARG2(__lsx_vld, temp_buff, 304, temp_buff, 368, temp_buff, 432, + temp_buff, 496, in12, in13, in14, in15); + LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + LSX_TRANSPOSE8x8_H(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, + in10, in11, in12, in13, in14, in15); + LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, + in11, in12, in13, in14, in15, step0, step1, step2, step3, + step4, step5, step6, step7, in8, in9, in10, in11, in12, + in13, in14, in15); + + __lsx_vst(step0, output, 0); + __lsx_vst(step1, output, 16); + __lsx_vst(step2, output, 32); + __lsx_vst(step3, output, 48); + __lsx_vst(step4, output, 64); + __lsx_vst(step5, output, 80); + __lsx_vst(step6, output, 96); + __lsx_vst(step7, output, 112); + + __lsx_vst(in8, output, 384); + __lsx_vst(in9, output, 400); + __lsx_vst(in10, output, 416); + __lsx_vst(in11, output, 432); + __lsx_vst(in12, output, 448); + __lsx_vst(in13, output, 464); + __lsx_vst(in14, output, 480); + __lsx_vst(in15, output, 496); + + /* 2nd set */ + DUP4_ARG2(__lsx_vld, temp_buff, 16, temp_buff, 80, temp_buff, 144, temp_buff, + 208, in0, in1, in2, in3); + DUP4_ARG2(__lsx_vld, temp_buff, 272, temp_buff, 336, temp_buff, 400, + temp_buff, 464, in4, in5, in6, in7); + DUP4_ARG2(__lsx_vld, temp_buff, 32, temp_buff, 96, temp_buff, 160, temp_buff, + 224, in8, in9, in10, in11); + DUP4_ARG2(__lsx_vld, temp_buff, 288, temp_buff, 352, temp_buff, 416, + temp_buff, 480, in12, in13, in14, in15); + LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + LSX_TRANSPOSE8x8_H(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, + in10, in11, in12, in13, in14, in15); + LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, + in11, in12, in13, in14, in15, step0, step1, step2, step3, + step4, step5, step6, step7, in8, in9, in10, in11, in12, + in13, in14, in15); + + __lsx_vst(step0, output, 128); + __lsx_vst(step1, output, 144); + __lsx_vst(step2, output, 160); + __lsx_vst(step3, output, 176); + __lsx_vst(step4, output, 192); + __lsx_vst(step5, output, 208); + __lsx_vst(step6, output, 224); + __lsx_vst(step7, output, 240); + + __lsx_vst(in8, output, 256); + __lsx_vst(in9, output, 272); + __lsx_vst(in10, output, 288); + __lsx_vst(in11, output, 304); + 
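/* Offsets passed to __lsx_vst are in bytes: each store writes one 16-byte vector, i.e. eight int16 coefficients, so consecutive vectors step by 16. */ +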
__lsx_vst(in12, output, 320); + __lsx_vst(in13, output, 336); + __lsx_vst(in14, output, 352); + __lsx_vst(in15, output, 368); +} + +static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr, + int16_t *out) { + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i in8, in9, in10, in11, in12, in13, in14, in15; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i vec0_l, vec1_l, vec2_l, vec3_l, vec4_l, vec5_l, vec6_l, vec7_l; + __m128i vec0_r, vec1_r, vec2_r, vec3_r, vec4_r, vec5_r, vec6_r, vec7_r; + __m128i tmp0_w, tmp1_w, tmp2_w, tmp3_w; + + /* fdct32 even */ + /* stage 2 */ + DUP4_ARG2(__lsx_vld, input, 0, input, 16, input, 32, input, 48, in0, in1, in2, + in3); + DUP4_ARG2(__lsx_vld, input, 64, input, 80, input, 96, input, 112, in4, in5, + in6, in7); + DUP4_ARG2(__lsx_vld, input, 128, input, 144, input, 160, input, 176, in8, in9, + in10, in11); + DUP4_ARG2(__lsx_vld, input, 192, input, 208, input, 224, input, 240, in12, + in13, in14, in15); + + LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, + in11, in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, + vec5, vec6, vec7, in8, in9, in10, in11, in12, in13, in14, + in15); + + __lsx_vst(vec0, interm_ptr, 0); + __lsx_vst(vec1, interm_ptr, 16); + __lsx_vst(vec2, interm_ptr, 32); + __lsx_vst(vec3, interm_ptr, 48); + __lsx_vst(vec4, interm_ptr, 64); + __lsx_vst(vec5, interm_ptr, 80); + __lsx_vst(vec6, interm_ptr, 96); + __lsx_vst(vec7, interm_ptr, 112); + + __lsx_vst(in8, interm_ptr, 128); + __lsx_vst(in9, interm_ptr, 144); + __lsx_vst(in10, interm_ptr, 160); + __lsx_vst(in11, interm_ptr, 176); + __lsx_vst(in12, interm_ptr, 192); + __lsx_vst(in13, interm_ptr, 208); + __lsx_vst(in14, interm_ptr, 224); + __lsx_vst(in15, interm_ptr, 240); + + /* Stage 3 */ + UNPCK_SH_SW(vec0, vec0_l, vec0_r); + UNPCK_SH_SW(vec1, vec1_l, vec1_r); + UNPCK_SH_SW(vec2, vec2_l, vec2_r); + UNPCK_SH_SW(vec3, vec3_l, vec3_r); + UNPCK_SH_SW(vec4, vec4_l, vec4_r); + UNPCK_SH_SW(vec5, vec5_l, vec5_r); + UNPCK_SH_SW(vec6, vec6_l, vec6_r); + UNPCK_SH_SW(vec7, vec7_l, vec7_r); + DUP4_ARG2(__lsx_vadd_w, vec0_r, vec7_r, vec1_r, vec6_r, vec2_r, vec5_r, + vec3_r, vec4_r, tmp0_w, tmp1_w, tmp2_w, tmp3_w); + LSX_BUTTERFLY_4_W(tmp0_w, tmp1_w, tmp2_w, tmp3_w, vec4_r, vec6_r, vec7_r, + vec5_r); + DUP4_ARG2(__lsx_vadd_w, vec0_l, vec7_l, vec1_l, vec6_l, vec2_l, vec5_l, + vec3_l, vec4_l, vec0_r, vec1_r, vec2_r, vec3_r); + + tmp3_w = __lsx_vadd_w(vec0_r, vec3_r); + vec0_r = __lsx_vsub_w(vec0_r, vec3_r); + vec3_r = __lsx_vadd_w(vec1_r, vec2_r); + vec1_r = __lsx_vsub_w(vec1_r, vec2_r); + + DOTP_CONST_PAIR_W(vec4_r, vec6_r, tmp3_w, vec3_r, cospi_16_64, cospi_16_64, + vec4_r, tmp3_w, vec6_r, vec3_r); + FDCT32_POSTPROC_NEG_W(vec4_r); + FDCT32_POSTPROC_NEG_W(tmp3_w); + FDCT32_POSTPROC_NEG_W(vec6_r); + FDCT32_POSTPROC_NEG_W(vec3_r); + DUP2_ARG2(__lsx_vpickev_h, vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5); + __lsx_vst(vec5, out, 0); + __lsx_vst(vec4, out, 16); + + DOTP_CONST_PAIR_W(vec5_r, vec7_r, vec0_r, vec1_r, cospi_24_64, cospi_8_64, + vec4_r, tmp3_w, vec6_r, vec3_r); + FDCT32_POSTPROC_NEG_W(vec4_r); + FDCT32_POSTPROC_NEG_W(tmp3_w); + FDCT32_POSTPROC_NEG_W(vec6_r); + FDCT32_POSTPROC_NEG_W(vec3_r); + DUP2_ARG2(__lsx_vpickev_h, vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5); + __lsx_vst(vec5, out, 32); + __lsx_vst(vec4, out, 48); + + DUP4_ARG2(__lsx_vld, interm_ptr, 0, interm_ptr, 16, interm_ptr, 32, + interm_ptr, 48, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vld, interm_ptr, 64, interm_ptr, 80, interm_ptr, 96, + interm_ptr, 112, vec4, vec5, 
vec6, vec7); + DUP4_ARG2(__lsx_vsub_h, vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, + vec5, vec6, vec7); + DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); + DUP2_ARG2(__lsx_vadd_h, vec4, vec5, vec7, vec6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, in5, in4); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + __lsx_vst(in4, out, 64); + __lsx_vst(in5, out, 112); + + DUP2_ARG2(__lsx_vsub_h, vec4, vec5, vec7, vec6, vec4, vec7); + DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, in5, in4); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + __lsx_vst(in4, out, 80); + __lsx_vst(in5, out, 96); + + DUP4_ARG2(__lsx_vld, interm_ptr, 128, interm_ptr, 144, interm_ptr, 160, + interm_ptr, 176, in8, in9, in10, in11); + DUP4_ARG2(__lsx_vld, interm_ptr, 192, interm_ptr, 208, interm_ptr, 224, + interm_ptr, 240, in12, in13, in14, in15); + DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); + DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); + DUP4_ARG2(__lsx_vadd_h, in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, + vec1, vec6, in2); + DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); + DUP2_ARG2(__lsx_vadd_h, in0, in1, in2, in3, vec0, vec7); + DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, in5, in4); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + __lsx_vst(in4, out, 128); + __lsx_vst(in5, out, 240); + + DUP2_ARG2(__lsx_vsub_h, in0, in1, in2, in3, in0, in2); + DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, in5, in4); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + __lsx_vst(in4, out, 144); + __lsx_vst(in5, out, 224); + + DUP2_ARG2(__lsx_vsub_h, in9, vec2, in14, vec5, vec2, vec5); + tmp0_w = __lsx_vneg_h(vec2); + DOTP_CONST_PAIR(tmp0_w, vec5, cospi_24_64, cospi_8_64, in2, in1); + DUP4_ARG2(__lsx_vsub_h, in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, + vec2, vec5); + DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, in5, in4); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + __lsx_vst(in4, out, 160); + __lsx_vst(in5, out, 208); + + DUP2_ARG2(__lsx_vadd_h, in3, in2, in0, in1, vec3, vec4); + DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, in4, in5); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + __lsx_vst(in4, out, 192); + __lsx_vst(in5, out, 176); +} + +static void fdct8x32_1d_row_even(int16_t *temp, int16_t *out) { + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i in8, in9, in10, in11, in12, in13, in14, in15; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1; + + /* fdct32 even */ + /* stage 2 */ + DUP4_ARG2(__lsx_vld, temp, 0, temp, 16, temp, 32, temp, 48, in0, in1, in2, + in3); + DUP4_ARG2(__lsx_vld, temp, 64, temp, 80, temp, 96, temp, 112, in4, in5, in6, + in7); + DUP4_ARG2(__lsx_vld, temp, 128, temp, 144, temp, 160, temp, 176, in8, in9, + in10, in11); + DUP4_ARG2(__lsx_vld, temp, 192, temp, 208, temp, 224, temp, 240, in12, in13, + in14, in15); + + LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, + in11, in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, + vec5, vec6, vec7, in8, in9, in10, in11, in12, in13, in14, + in15); + /* Stage 3 */ + DUP4_ARG2(__lsx_vadd_h, vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, + in1, in2, in3); + LSX_BUTTERFLY_4_H(in0, in1, in2, in3, temp0, in4, in1, in0); + DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + __lsx_vst(temp0, out, 0); + __lsx_vst(temp1, out, 16); + + DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + 
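/* Rotation by the (cospi_24_64, cospi_8_64) pair, then the row-pass rounding shift; the resulting coefficient vectors land at byte offsets 32 and 48 of the output row. */ +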
__lsx_vst(temp0, out, 32); + __lsx_vst(temp1, out, 48); + + DUP4_ARG2(__lsx_vsub_h, vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, + vec5, vec6, vec7); + DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); + DUP2_ARG2(__lsx_vadd_h, vec4, vec5, vec7, vec6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + __lsx_vst(temp0, out, 64); + __lsx_vst(temp1, out, 112); + + DUP2_ARG2(__lsx_vsub_h, vec4, vec5, vec7, vec6, vec4, vec7); + DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + __lsx_vst(temp0, out, 80); + __lsx_vst(temp1, out, 96); + + DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); + DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); + DUP4_ARG2(__lsx_vadd_h, in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, + vec1, vec6, in2); + DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); + DUP2_ARG2(__lsx_vadd_h, in0, in1, in2, in3, vec0, vec7); + DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + __lsx_vst(temp0, out, 128); + __lsx_vst(temp1, out, 240); + + DUP2_ARG2(__lsx_vsub_h, in0, in1, in2, in3, in0, in2); + DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + __lsx_vst(temp0, out, 144); + __lsx_vst(temp1, out, 224); + + DUP2_ARG2(__lsx_vsub_h, in9, vec2, in14, vec5, vec2, vec5); + temp0 = __lsx_vneg_h(vec2); + DOTP_CONST_PAIR(temp0, vec5, cospi_24_64, cospi_8_64, in2, in1); + DUP4_ARG2(__lsx_vsub_h, in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, + vec2, vec5) + DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + __lsx_vst(temp0, out, 160); + __lsx_vst(temp1, out, 208); + + DUP2_ARG2(__lsx_vadd_h, in3, in2, in0, in1, vec3, vec4); + DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + __lsx_vst(temp0, out, 192); + __lsx_vst(temp1, out, 176); +} + +static void fdct8x32_1d_row_odd(int16_t *temp, int16_t *interm_ptr, + int16_t *out) { + __m128i in16, in17, in18, in19, in20, in21, in22, in23; + __m128i in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5; + __m128i tmp0, tmp1; + + in20 = __lsx_vld(temp, 64); + in21 = __lsx_vld(temp, 80); + in26 = __lsx_vld(temp, 160); + in27 = __lsx_vld(temp, 176); + + DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27); + DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26); + + in18 = __lsx_vld(temp, 32); + in19 = __lsx_vld(temp, 48); + in28 = __lsx_vld(temp, 192); + in29 = __lsx_vld(temp, 208); + + vec4 = __lsx_vsub_h(in19, in20); + __lsx_vst(vec4, interm_ptr, 64); + vec4 = __lsx_vsub_h(in18, in21); + __lsx_vst(vec4, interm_ptr, 176); + vec4 = __lsx_vsub_h(in28, in27); + __lsx_vst(vec4, interm_ptr, 112); + vec4 = __lsx_vsub_h(in29, in26); + __lsx_vst(vec4, interm_ptr, 128); + + DUP4_ARG2(__lsx_vadd_h, in18, in21, in19, in20, in28, in27, in29, in26, in21, + in20, in27, in26); + + in22 = __lsx_vld(temp, 96); + in23 = __lsx_vld(temp, 112); + in24 = __lsx_vld(temp, 128); + in25 = __lsx_vld(temp, 144); + + DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25); + DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24); + + in16 = __lsx_vld(temp, 0); + in17 = __lsx_vld(temp, 16); + in30 = __lsx_vld(temp, 224); + in31 = __lsx_vld(temp, 240); + + vec4 = __lsx_vsub_h(in17, in22); + 
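/* These differences are parked in interm_ptr and reloaded further down to form the remaining odd-path coefficients. */ +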
__lsx_vst(vec4, interm_ptr, 80); + vec4 = __lsx_vsub_h(in30, in25); + __lsx_vst(vec4, interm_ptr, 96); + vec4 = __lsx_vsub_h(in31, in24); + __lsx_vst(vec4, interm_ptr, 144); + vec4 = __lsx_vsub_h(in16, in23); + __lsx_vst(vec4, interm_ptr, 160); + + DUP4_ARG2(__lsx_vadd_h, in16, in23, in17, in22, in30, in25, in31, in24, in16, + in17, in30, in31); + DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29); + DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28); + + DUP4_ARG2(__lsx_vadd_h, in16, in19, in17, in18, in30, in29, in31, in28, in27, + in22, in21, in25); + DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24); + DUP2_ARG2(__lsx_vadd_h, in27, in26, in25, in24, in23, in20); + + DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + __lsx_vst(vec5, out, 0); + __lsx_vst(vec4, out, 240); + + DUP2_ARG2(__lsx_vsub_h, in27, in26, in25, in24, in22, in21); + + DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + __lsx_vst(vec5, out, 224); + __lsx_vst(vec4, out, 16); + + DUP4_ARG2(__lsx_vsub_h, in17, in18, in16, in19, in31, in28, in30, in29, in23, + in26, in24, in20); + tmp0 = __lsx_vneg_h(in23); + DOTP_CONST_PAIR(tmp0, in20, cospi_28_64, cospi_4_64, in27, in25); + DUP2_ARG2(__lsx_vsub_h, in26, in27, in24, in25, in23, in20); + + DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + __lsx_vst(vec4, out, 32); + __lsx_vst(vec5, out, 208); + + DUP2_ARG2(__lsx_vadd_h, in26, in27, in24, in25, in22, in21); + DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + __lsx_vst(vec4, out, 48); + __lsx_vst(vec5, out, 192); + + in20 = __lsx_vld(interm_ptr, 64); + in21 = __lsx_vld(interm_ptr, 176); + in27 = __lsx_vld(interm_ptr, 112); + in26 = __lsx_vld(interm_ptr, 128); + + in16 = in20; + in17 = in21; + DUP2_ARG1(__lsx_vneg_h, in16, in17, tmp0, tmp1); + DOTP_CONST_PAIR(tmp0, in27, cospi_24_64, cospi_8_64, in20, in27); + DOTP_CONST_PAIR(tmp1, in26, cospi_24_64, cospi_8_64, in21, in26); + + in22 = __lsx_vld(interm_ptr, 80); + in25 = __lsx_vld(interm_ptr, 96); + in24 = __lsx_vld(interm_ptr, 144); + in23 = __lsx_vld(interm_ptr, 160); + + DUP4_ARG2(__lsx_vsub_h, in23, in20, in22, in21, in25, in26, in24, in27, in28, + in17, in18, in31); + DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30); + DUP2_ARG2(__lsx_vadd_h, in28, in29, in31, in30, in16, in19); + DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + __lsx_vst(vec5, out, 64); + __lsx_vst(vec4, out, 176); + + DUP2_ARG2(__lsx_vsub_h, in28, in29, in31, in30, in17, in18); + DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + __lsx_vst(vec5, out, 80); + __lsx_vst(vec4, out, 160); + + DUP4_ARG2(__lsx_vadd_h, in22, in21, in23, in20, in24, in27, in25, in26, in16, + in29, in30, in19); + tmp0 = __lsx_vneg_h(in16); + DOTP_CONST_PAIR(tmp0, in19, cospi_12_64, cospi_20_64, in28, in31); + DUP2_ARG2(__lsx_vsub_h, in29, in28, in30, in31, in16, in19); + + DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + __lsx_vst(vec5, out, 144); + __lsx_vst(vec4, out, 96); + + DUP2_ARG2(__lsx_vadd_h, in29, in28, in30, in31, in17, in18); + + DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + __lsx_vst(vec4, out, 112); + 
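/* The (cospi_3_64, cospi_29_64) rotation yields the final odd-path pair, stored at byte offsets 112 and 128. */ +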
__lsx_vst(vec5, out, 128); +} + +static void fdct8x32_1d_row_transpose_store(int16_t *temp, int16_t *output) { + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1; + + /* 1st set */ + in0 = __lsx_vld(temp, 0); + in4 = __lsx_vld(temp, 64); + in2 = __lsx_vld(temp, 128); + in6 = __lsx_vld(temp, 192); + in1 = __lsx_vld(temp, 256); + in7 = __lsx_vld(temp, 304); + in3 = __lsx_vld(temp, 384); + in5 = __lsx_vld(temp, 432); + + LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + + /* 2nd set */ + in0_1 = __lsx_vld(temp, 32); + in1_1 = __lsx_vld(temp, 464); + in2_1 = __lsx_vld(temp, 160); + in3_1 = __lsx_vld(temp, 336); + in4_1 = __lsx_vld(temp, 96); + in5_1 = __lsx_vld(temp, 352); + in6_1 = __lsx_vld(temp, 224); + in7_1 = __lsx_vld(temp, 480); + + __lsx_vst(in0, output, 0); + __lsx_vst(in1, output, 64); + __lsx_vst(in2, output, 128); + __lsx_vst(in3, output, 192); + __lsx_vst(in4, output, 256); + __lsx_vst(in5, output, 320); + __lsx_vst(in6, output, 384); + __lsx_vst(in7, output, 448); + + LSX_TRANSPOSE8x8_H(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, + in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1); + + /* 3rd set */ + in0 = __lsx_vld(temp, 16); + in1 = __lsx_vld(temp, 272); + in2 = __lsx_vld(temp, 144); + in3 = __lsx_vld(temp, 400); + in4 = __lsx_vld(temp, 80); + in5 = __lsx_vld(temp, 416); + in6 = __lsx_vld(temp, 208); + in7 = __lsx_vld(temp, 288); + + __lsx_vst(in0_1, output, 16); + __lsx_vst(in1_1, output, 80); + __lsx_vst(in2_1, output, 144); + __lsx_vst(in3_1, output, 208); + __lsx_vst(in4_1, output, 272); + __lsx_vst(in5_1, output, 336); + __lsx_vst(in6_1, output, 400); + __lsx_vst(in7_1, output, 464); + + LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + + __lsx_vst(in0, output, 32); + __lsx_vst(in1, output, 96); + __lsx_vst(in2, output, 160); + __lsx_vst(in3, output, 224); + __lsx_vst(in4, output, 288); + __lsx_vst(in5, output, 352); + __lsx_vst(in6, output, 416); + __lsx_vst(in7, output, 480); + + /* 4th set */ + in0_1 = __lsx_vld(temp, 48); + in1_1 = __lsx_vld(temp, 448); + in2_1 = __lsx_vld(temp, 176); + in3_1 = __lsx_vld(temp, 320); + in4_1 = __lsx_vld(temp, 112); + in5_1 = __lsx_vld(temp, 368); + in6_1 = __lsx_vld(temp, 240); + in7_1 = __lsx_vld(temp, 496); + + LSX_TRANSPOSE8x8_H(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, + in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1); + + __lsx_vst(in0_1, output, 48); + __lsx_vst(in1_1, output, 112); + __lsx_vst(in2_1, output, 176); + __lsx_vst(in3_1, output, 240); + __lsx_vst(in4_1, output, 304); + __lsx_vst(in5_1, output, 368); + __lsx_vst(in6_1, output, 432); + __lsx_vst(in7_1, output, 496); +} + +static void fdct32x8_1d_row(int16_t *temp, int16_t *temp_buf, int16_t *output) { + fdct8x32_1d_row_load_butterfly(temp, temp_buf); + fdct8x32_1d_row_even(temp_buf, temp_buf); + fdct8x32_1d_row_odd(temp_buf + 128, temp, temp_buf + 128); + fdct8x32_1d_row_transpose_store(temp_buf, output); +} + +static void fdct32x8_1d_row_4x(int16_t *tmp_buf_big, int16_t *tmp_buf, + int16_t *output) { + fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf); + fdct8x32_1d_row_even_4x(tmp_buf, tmp_buf_big, tmp_buf); + fdct8x32_1d_row_odd(tmp_buf + 128, tmp_buf_big, tmp_buf + 128); + fdct8x32_1d_row_transpose_store(tmp_buf, output); +} + +void vpx_fdct32x32_lsx(const int16_t *input, int16_t *output, + int32_t src_stride) { + int i; + DECLARE_ALIGNED(32, int16_t, 
tmp_buf_big[1024]); + DECLARE_ALIGNED(32, int16_t, tmp_buf[256]); + + /* column transform */ + for (i = 0; i < 4; ++i) { + fdct8x32_1d_column(input + (8 * i), src_stride, tmp_buf, + tmp_buf_big + (8 * i)); + } + + /* row transform */ + fdct32x8_1d_row_4x(tmp_buf_big, tmp_buf, output); + + /* row transform */ + for (i = 1; i < 4; ++i) { + fdct32x8_1d_row(tmp_buf_big + (i * 256), tmp_buf, output + (i * 256)); + } +} + +static void fdct8x32_1d_row_even_rd(int16_t *temp, int16_t *out) { + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i in8, in9, in10, in11, in12, in13, in14, in15; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1; + + /* fdct32 even */ + /* stage 2 */ + DUP4_ARG2(__lsx_vld, temp, 0, temp, 16, temp, 32, temp, 48, in0, in1, in2, + in3); + DUP4_ARG2(__lsx_vld, temp, 64, temp, 80, temp, 96, temp, 112, in4, in5, in6, + in7); + DUP4_ARG2(__lsx_vld, temp, 128, temp, 144, temp, 160, temp, 176, in8, in9, + in10, in11); + DUP4_ARG2(__lsx_vld, temp, 192, temp, 208, temp, 224, temp, 240, in12, in13, + in14, in15); + LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, + in11, in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, + vec5, vec6, vec7, in8, in9, in10, in11, in12, in13, in14, + in15); + + FDCT_POSTPROC_2V_NEG_H(vec0, vec1); + FDCT_POSTPROC_2V_NEG_H(vec2, vec3); + FDCT_POSTPROC_2V_NEG_H(vec4, vec5); + FDCT_POSTPROC_2V_NEG_H(vec6, vec7); + FDCT_POSTPROC_2V_NEG_H(in8, in9); + FDCT_POSTPROC_2V_NEG_H(in10, in11); + FDCT_POSTPROC_2V_NEG_H(in12, in13); + FDCT_POSTPROC_2V_NEG_H(in14, in15); + + /* Stage 3 */ + DUP4_ARG2(__lsx_vadd_h, vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, + in1, in2, in3); + + temp0 = __lsx_vadd_h(in0, in3); + in0 = __lsx_vsub_h(in0, in3); + in3 = __lsx_vadd_h(in1, in2); + in1 = __lsx_vsub_h(in1, in2); + + DOTP_CONST_PAIR(temp0, in3, cospi_16_64, cospi_16_64, temp1, temp0); + __lsx_vst(temp0, out, 0); + __lsx_vst(temp1, out, 16); + + DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0); + __lsx_vst(temp0, out, 32); + __lsx_vst(temp1, out, 48); + + DUP4_ARG2(__lsx_vsub_h, vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, + vec5, vec6, vec7); + DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); + DUP2_ARG2(__lsx_vadd_h, vec4, vec5, vec7, vec6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0); + __lsx_vst(temp0, out, 64); + __lsx_vst(temp1, out, 112); + + DUP2_ARG2(__lsx_vsub_h, vec4, vec5, vec7, vec6, vec4, vec7); + DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0); + __lsx_vst(temp0, out, 80); + __lsx_vst(temp1, out, 96); + + DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); + DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); + DUP4_ARG2(__lsx_vadd_h, in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, + vec1, vec6, in2); + DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); + DUP2_ARG2(__lsx_vadd_h, in0, in1, in2, in3, vec0, vec7); + DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0); + __lsx_vst(temp0, out, 128); + __lsx_vst(temp1, out, 240); + + DUP2_ARG2(__lsx_vsub_h, in0, in1, in2, in3, in0, in2); + DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0); + __lsx_vst(temp0, out, 144); + __lsx_vst(temp1, out, 224); + + DUP2_ARG2(__lsx_vsub_h, in9, vec2, in14, vec5, vec2, vec5); + temp0 = __lsx_vneg_h(vec2); + DOTP_CONST_PAIR(temp0, vec5, cospi_24_64, cospi_8_64, in2, in1); + DUP4_ARG2(__lsx_vsub_h, in8, vec3, in15, vec4, in3, in2, in0, in1, in3, 
in0, + vec2, vec5); + DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0); + __lsx_vst(temp0, out, 160); + __lsx_vst(temp1, out, 208); + + DUP2_ARG2(__lsx_vadd_h, in3, in2, in0, in1, vec3, vec4); + DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1); + __lsx_vst(temp0, out, 192); + __lsx_vst(temp1, out, 176); +} + +static void fdct8x32_1d_row_odd_rd(int16_t *temp, int16_t *interm_ptr, + int16_t *out) { + __m128i in16, in17, in18, in19, in20, in21, in22, in23; + __m128i in24, in25, in26, in27, in28, in29, in30, in31; + __m128i vec4, vec5, tmp0, tmp1; + + in20 = __lsx_vld(temp, 64); + in21 = __lsx_vld(temp, 80); + in26 = __lsx_vld(temp, 160); + in27 = __lsx_vld(temp, 176); + + DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27); + DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26); + + FDCT_POSTPROC_2V_NEG_H(in20, in21); + FDCT_POSTPROC_2V_NEG_H(in26, in27); + + in18 = __lsx_vld(temp, 32); + in19 = __lsx_vld(temp, 48); + in28 = __lsx_vld(temp, 192); + in29 = __lsx_vld(temp, 208); + + FDCT_POSTPROC_2V_NEG_H(in18, in19); + FDCT_POSTPROC_2V_NEG_H(in28, in29); + + vec4 = __lsx_vsub_h(in19, in20); + __lsx_vst(vec4, interm_ptr, 64); + vec4 = __lsx_vsub_h(in18, in21); + __lsx_vst(vec4, interm_ptr, 176); + vec4 = __lsx_vsub_h(in29, in26); + __lsx_vst(vec4, interm_ptr, 128); + vec4 = __lsx_vsub_h(in28, in27); + __lsx_vst(vec4, interm_ptr, 112); + + DUP4_ARG2(__lsx_vadd_h, in18, in21, in19, in20, in28, in27, in29, in26, in21, + in20, in27, in26); + + in22 = __lsx_vld(temp, 96); + in23 = __lsx_vld(temp, 112); + in24 = __lsx_vld(temp, 128); + in25 = __lsx_vld(temp, 144); + + DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25); + DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24); + FDCT_POSTPROC_2V_NEG_H(in22, in23); + FDCT_POSTPROC_2V_NEG_H(in24, in25); + + in16 = __lsx_vld(temp, 0); + in17 = __lsx_vld(temp, 16); + in30 = __lsx_vld(temp, 224); + in31 = __lsx_vld(temp, 240); + + FDCT_POSTPROC_2V_NEG_H(in16, in17); + FDCT_POSTPROC_2V_NEG_H(in30, in31); + + vec4 = __lsx_vsub_h(in17, in22); + __lsx_vst(vec4, interm_ptr, 80); + vec4 = __lsx_vsub_h(in30, in25); + __lsx_vst(vec4, interm_ptr, 96); + vec4 = __lsx_vsub_h(in31, in24); + __lsx_vst(vec4, interm_ptr, 144); + vec4 = __lsx_vsub_h(in16, in23); + __lsx_vst(vec4, interm_ptr, 160); + + DUP4_ARG2(__lsx_vadd_h, in16, in23, in17, in22, in30, in25, in31, in24, in16, + in17, in30, in31); + DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29); + DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28); + DUP4_ARG2(__lsx_vadd_h, in16, in19, in17, in18, in30, in29, in31, in28, in27, + in22, in21, in25); + DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24); + DUP2_ARG2(__lsx_vadd_h, in27, in26, in25, in24, in23, in20); + DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5); + __lsx_vst(vec5, out, 0); + __lsx_vst(vec4, out, 240); + + DUP2_ARG2(__lsx_vsub_h, in27, in26, in25, in24, in22, in21); + DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4); + __lsx_vst(vec5, out, 224); + __lsx_vst(vec4, out, 16); + + DUP4_ARG2(__lsx_vsub_h, in17, in18, in16, in19, in31, in28, in30, in29, in23, + in26, in24, in20); + tmp0 = __lsx_vneg_h(in23); + DOTP_CONST_PAIR(tmp0, in20, cospi_28_64, cospi_4_64, in27, in25); + DUP2_ARG2(__lsx_vsub_h, in26, in27, in24, in25, in23, in20); + DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5); + __lsx_vst(vec4, out, 32); + __lsx_vst(vec5, out, 208); + + DUP2_ARG2(__lsx_vadd_h, in26, 
in27, in24, in25, in22, in21); + DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5); + __lsx_vst(vec4, out, 48); + __lsx_vst(vec5, out, 192); + + in20 = __lsx_vld(interm_ptr, 64); + in21 = __lsx_vld(interm_ptr, 176); + in27 = __lsx_vld(interm_ptr, 112); + in26 = __lsx_vld(interm_ptr, 128); + + in16 = in20; + in17 = in21; + DUP2_ARG1(__lsx_vneg_h, in16, in17, tmp0, tmp1); + DOTP_CONST_PAIR(tmp0, in27, cospi_24_64, cospi_8_64, in20, in27); + DOTP_CONST_PAIR(tmp1, in26, cospi_24_64, cospi_8_64, in21, in26); + + in22 = __lsx_vld(interm_ptr, 80); + in25 = __lsx_vld(interm_ptr, 96); + in24 = __lsx_vld(interm_ptr, 144); + in23 = __lsx_vld(interm_ptr, 160); + + DUP4_ARG2(__lsx_vsub_h, in23, in20, in22, in21, in25, in26, in24, in27, in28, + in17, in18, in31); + DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30); + in16 = __lsx_vadd_h(in28, in29); + in19 = __lsx_vadd_h(in31, in30); + DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4); + __lsx_vst(vec5, out, 64); + __lsx_vst(vec4, out, 176); + + DUP2_ARG2(__lsx_vsub_h, in28, in29, in31, in30, in17, in18); + DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4); + __lsx_vst(vec5, out, 80); + __lsx_vst(vec4, out, 160); + + DUP4_ARG2(__lsx_vadd_h, in22, in21, in23, in20, in24, in27, in25, in26, in16, + in29, in30, in19); + tmp0 = __lsx_vneg_h(in16); + DOTP_CONST_PAIR(tmp0, in19, cospi_12_64, cospi_20_64, in28, in31); + DUP2_ARG2(__lsx_vsub_h, in29, in28, in30, in31, in16, in19); + DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4); + __lsx_vst(vec5, out, 144); + __lsx_vst(vec4, out, 96); + + DUP2_ARG2(__lsx_vadd_h, in29, in28, in30, in31, in17, in18); + DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4); + __lsx_vst(vec4, out, 112); + __lsx_vst(vec5, out, 128); +} + +static void fdct32x8_1d_row_rd(int16_t *tmp_buf_big, int16_t *tmp_buf, + int16_t *output) { + fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf); + fdct8x32_1d_row_even_rd(tmp_buf, tmp_buf); + fdct8x32_1d_row_odd_rd((tmp_buf + 128), tmp_buf_big, (tmp_buf + 128)); + fdct8x32_1d_row_transpose_store(tmp_buf, output); +} + +void vpx_fdct32x32_rd_lsx(const int16_t *input, int16_t *out, + int32_t src_stride) { + int32_t i; + DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]); + DECLARE_ALIGNED(32, int16_t, tmp_buf[256]); + + /* column transform */ + for (i = 0; i < 4; ++i) { + fdct8x32_1d_column(input + (8 * i), src_stride, &tmp_buf[0], + &tmp_buf_big[0] + (8 * i)); + } + /* row transform */ + for (i = 0; i < 4; ++i) { + fdct32x8_1d_row_rd(&tmp_buf_big[0] + (8 * i * 32), &tmp_buf[0], + out + (8 * i * 32)); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.c new file mode 100644 index 0000000000..508532b9d8 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.c @@ -0,0 +1,350 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/fwd_txfm_lsx.h" + +#define LSX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ + do { \ + __m128i _s0, _s1, _s2, _s3, _t0, _t1, _t2, _t3; \ + \ + DUP2_ARG2(__lsx_vilvl_h, _in2, _in0, _in3, _in1, _s0, _s1); \ + DUP2_ARG2(__lsx_vilvh_h, _in2, _in0, _in3, _in1, _s2, _s3); \ + _t0 = __lsx_vilvl_h(_s1, _s0); \ + _t1 = __lsx_vilvh_h(_s1, _s0); \ + _t2 = __lsx_vilvl_h(_s3, _s2); \ + _t3 = __lsx_vilvh_h(_s3, _s2); \ + DUP2_ARG2(__lsx_vpickev_d, _t2, _t0, _t3, _t1, _out0, _out2); \ + DUP2_ARG2(__lsx_vpickod_d, _t2, _t0, _t3, _t1, _out1, _out3); \ + } while (0) + +#if !CONFIG_VP9_HIGHBITDEPTH +void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr, + int32_t src_stride) { + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i in8, in9, in10, in11, in12, in13, in14, in15; + __m128i stp21, stp22, stp23, stp24, stp25, stp26, stp30; + __m128i stp31, stp32, stp33, stp34, stp35, stp36, stp37; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, cnst0, cnst1, cnst4, cnst5; + __m128i coeff = { 0x187e3b21d2bf2d41, 0x238e3537e782c4df }; + __m128i coeff1 = { 0x289a317906463fb1, 0x12943d3f1e2b3871 }; + __m128i coeff2 = { 0xed6cd766c78fc04f, 0x0 }; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride4 = src_stride2 << 1; + int32_t src_stride6 = src_stride4 + src_stride2; + int32_t src_stride8 = src_stride4 << 1; + int16_t *input_tmp = (int16_t *)input; + in0 = __lsx_vld(input_tmp, 0); + DUP4_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, + input_tmp, src_stride6, input_tmp, src_stride8, in1, in2, in3, in4); + input_tmp += src_stride4; + DUP4_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, + input_tmp, src_stride6, input_tmp, src_stride8, in5, in6, in7, in8); + input_tmp += src_stride4; + DUP4_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, + input_tmp, src_stride6, input_tmp, src_stride8, in9, in10, in11, + in12); + input_tmp += src_stride4; + DUP2_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, in13, + in14); + input_tmp += src_stride2; + in15 = __lsx_vldx(input_tmp, src_stride2); + + DUP4_ARG2(__lsx_vslli_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3); + DUP4_ARG2(__lsx_vslli_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7); + DUP4_ARG2(__lsx_vslli_h, in8, 2, in9, 2, in10, 2, in11, 2, in8, in9, in10, + in11); + DUP4_ARG2(__lsx_vslli_h, in12, 2, in13, 2, in14, 2, in15, 2, in12, in13, in14, + in15); + DUP4_ARG2(__lsx_vadd_h, in0, in15, in1, in14, in2, in13, in3, in12, tmp0, + tmp1, tmp2, tmp3); + DUP4_ARG2(__lsx_vadd_h, in4, in11, in5, in10, in6, in9, in7, in8, tmp4, tmp5, + tmp6, tmp7); + FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1, + tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); + __lsx_vst(tmp0, tmp_ptr, 0); + __lsx_vst(tmp1, tmp_ptr, 64); + __lsx_vst(tmp2, tmp_ptr, 128); + __lsx_vst(tmp3, tmp_ptr, 192); + __lsx_vst(tmp4, tmp_ptr, 256); + __lsx_vst(tmp5, tmp_ptr, 320); + __lsx_vst(tmp6, tmp_ptr, 384); + __lsx_vst(tmp7, tmp_ptr, 448); + DUP4_ARG2(__lsx_vsub_h, in0, in15, in1, in14, in2, in13, in3, in12, in15, + in14, in13, in12); + DUP4_ARG2(__lsx_vsub_h, in4, in11, in5, in10, in6, in9, in7, in8, in11, in10, + in9, in8); + + tmp_ptr += 16; + + /* stp 1 */ + DUP2_ARG2(__lsx_vilvh_h, in10, in13, in11, in12, vec2, vec4); + DUP2_ARG2(__lsx_vilvl_h, in10, in13, in11, in12, vec3, vec5); + + cnst4 = __lsx_vreplvei_h(coeff, 0); + DOT_SHIFT_RIGHT_PCK_H(vec2, 
vec3, cnst4, stp25); + + cnst5 = __lsx_vreplvei_h(coeff, 1); + cnst5 = __lsx_vpackev_h(cnst5, cnst4); + DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst5, stp22); + DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst4, stp24); + DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst5, stp23); + + /* stp2 */ + LSX_BUTTERFLY_4_H(in8, in9, stp22, stp23, stp30, stp31, stp32, stp33); + LSX_BUTTERFLY_4_H(in15, in14, stp25, stp24, stp37, stp36, stp35, stp34); + DUP2_ARG2(__lsx_vilvh_h, stp36, stp31, stp35, stp32, vec2, vec4); + DUP2_ARG2(__lsx_vilvl_h, stp36, stp31, stp35, stp32, vec3, vec5); + DUP2_ARG2(__lsx_vreplvei_h, coeff, 2, coeff, 3, cnst0, cnst1); + cnst0 = __lsx_vpackev_h(cnst0, cnst1); + DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst0, stp26); + + cnst0 = __lsx_vreplvei_h(coeff, 4); + cnst1 = __lsx_vpackev_h(cnst1, cnst0); + DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst1, stp21); + + LSX_BUTTERFLY_4_H(stp30, stp37, stp26, stp21, in8, in15, in14, in9); + vec1 = __lsx_vilvl_h(in15, in8); + vec0 = __lsx_vilvh_h(in15, in8); + + DUP2_ARG2(__lsx_vreplvei_h, coeff1, 0, coeff1, 1, cnst0, cnst1); + cnst0 = __lsx_vpackev_h(cnst0, cnst1); + + DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8); + __lsx_vst(in8, tmp_ptr, 0); + + cnst0 = __lsx_vreplvei_h(coeff2, 0); + cnst0 = __lsx_vpackev_h(cnst1, cnst0); + DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8); + __lsx_vst(in8, tmp_ptr, 448); + + vec1 = __lsx_vilvl_h(in14, in9); + vec0 = __lsx_vilvh_h(in14, in9); + DUP2_ARG2(__lsx_vreplvei_h, coeff1, 2, coeff1, 3, cnst0, cnst1); + cnst1 = __lsx_vpackev_h(cnst1, cnst0); + + DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1, in8); + __lsx_vst(in8, tmp_ptr, 256); + + cnst1 = __lsx_vreplvei_h(coeff2, 2); + cnst0 = __lsx_vpackev_h(cnst0, cnst1); + DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8); + __lsx_vst(in8, tmp_ptr, 192); + + DUP2_ARG2(__lsx_vreplvei_h, coeff, 2, coeff, 5, cnst0, cnst1); + cnst1 = __lsx_vpackev_h(cnst1, cnst0); + DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1, stp25); + + cnst1 = __lsx_vreplvei_h(coeff, 3); + cnst1 = __lsx_vpackev_h(cnst0, cnst1); + DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1, stp22); + + /* stp4 */ + DUP2_ARG2(__lsx_vadd_h, stp34, stp25, stp33, stp22, in13, in10); + + vec1 = __lsx_vilvl_h(in13, in10); + vec0 = __lsx_vilvh_h(in13, in10); + DUP2_ARG2(__lsx_vreplvei_h, coeff1, 4, coeff1, 5, cnst0, cnst1); + cnst0 = __lsx_vpackev_h(cnst0, cnst1); + DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8); + __lsx_vst(in8, tmp_ptr, 128); + + cnst0 = __lsx_vreplvei_h(coeff2, 1); + cnst0 = __lsx_vpackev_h(cnst1, cnst0); + DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8); + __lsx_vst(in8, tmp_ptr, 320); + + DUP2_ARG2(__lsx_vsub_h, stp34, stp25, stp33, stp22, in12, in11); + vec1 = __lsx_vilvl_h(in12, in11); + vec0 = __lsx_vilvh_h(in12, in11); + DUP2_ARG2(__lsx_vreplvei_h, coeff1, 6, coeff1, 7, cnst0, cnst1); + cnst1 = __lsx_vpackev_h(cnst1, cnst0); + + DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1, in8); + __lsx_vst(in8, tmp_ptr, 384); + + cnst1 = __lsx_vreplvei_h(coeff2, 3); + cnst0 = __lsx_vpackev_h(cnst0, cnst1); + DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8); + __lsx_vst(in8, tmp_ptr, 64); +} + +void fdct16x8_1d_row(int16_t *input, int16_t *output) { + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i in8, in9, in10, in11, in12, in13, in14, in15; + int16_t *input_tmp = input; + + DUP4_ARG2(__lsx_vld, input, 0, input, 32, input, 64, input, 96, in0, in1, in2, + in3); + DUP4_ARG2(__lsx_vld, input, 128, input, 160, input, 192, input, 224, in4, in5, + in6, in7); + DUP4_ARG2(__lsx_vld, input_tmp, 16, input_tmp, 48, 
input_tmp, 80, input_tmp, + 112, in8, in9, in10, in11); + DUP4_ARG2(__lsx_vld, input_tmp, 144, input_tmp, 176, input_tmp, 208, + input_tmp, 240, in12, in13, in14, in15); + + LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + LSX_TRANSPOSE8x8_H(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, + in10, in11, in12, in13, in14, in15); + DUP4_ARG2(__lsx_vaddi_hu, in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3); + DUP4_ARG2(__lsx_vaddi_hu, in4, 1, in5, 1, in6, 1, in7, 1, in4, in5, in6, in7); + DUP4_ARG2(__lsx_vaddi_hu, in8, 1, in9, 1, in10, 1, in11, 1, in8, in9, in10, + in11); + DUP4_ARG2(__lsx_vaddi_hu, in12, 1, in13, 1, in14, 1, in15, 1, in12, in13, + in14, in15); + + DUP4_ARG2(__lsx_vsrai_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3); + DUP4_ARG2(__lsx_vsrai_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7); + DUP4_ARG2(__lsx_vsrai_h, in8, 2, in9, 2, in10, 2, in11, 2, in8, in9, in10, + in11); + DUP4_ARG2(__lsx_vsrai_h, in12, 2, in13, 2, in14, 2, in15, 2, in12, in13, in14, + in15); + LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, + in11, in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4, + tmp5, tmp6, tmp7, in8, in9, in10, in11, in12, in13, in14, + in15); + __lsx_vst(in8, input, 0); + __lsx_vst(in9, input, 32); + __lsx_vst(in10, input, 64); + __lsx_vst(in11, input, 96); + __lsx_vst(in12, input, 128); + __lsx_vst(in13, input, 160); + __lsx_vst(in14, input, 192); + __lsx_vst(in15, input, 224); + + FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1, + tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); + DUP4_ARG2(__lsx_vld, input, 0, input, 32, input, 64, input, 96, in8, in9, + in10, in11); + DUP4_ARG2(__lsx_vld, input, 128, input, 160, input, 192, input, 224, in12, + in13, in14, in15); + FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15, in0, in1, in2, in3, + in4, in5, in6, in7); + LSX_TRANSPOSE8x8_H(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, tmp0, in0, + tmp1, in1, tmp2, in2, tmp3, in3); + __lsx_vst(tmp0, output, 0); + __lsx_vst(in0, output, 32); + __lsx_vst(tmp1, output, 64); + __lsx_vst(in1, output, 96); + __lsx_vst(tmp2, output, 128); + __lsx_vst(in2, output, 160); + __lsx_vst(tmp3, output, 192); + __lsx_vst(in3, output, 224); + + LSX_TRANSPOSE8x8_H(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, tmp4, in4, + tmp5, in5, tmp6, in6, tmp7, in7); + __lsx_vst(tmp4, output, 16); + __lsx_vst(in4, output, 48); + __lsx_vst(tmp5, output, 80); + __lsx_vst(in5, output, 112); + __lsx_vst(tmp6, output, 144); + __lsx_vst(in6, output, 176); + __lsx_vst(tmp7, output, 208); + __lsx_vst(in7, output, 240); +} + +void vpx_fdct4x4_lsx(const int16_t *input, int16_t *output, + int32_t src_stride) { + __m128i in0, in1, in2, in3; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride4 = src_stride2 << 1; + int32_t src_stride6 = src_stride4 + src_stride2; + + in0 = __lsx_vld(input, 0); + DUP2_ARG2(__lsx_vldx, input, src_stride2, input, src_stride4, in1, in2); + in3 = __lsx_vldx(input, src_stride6); + + /* fdct4 pre-process */ + { + __m128i vec, mask; + __m128i zero = __lsx_vldi(0); + + mask = __lsx_vinsgr2vr_b(zero, 1, 0); + DUP4_ARG2(__lsx_vslli_h, in0, 4, in1, 4, in2, 4, in3, 4, in0, in1, in2, + in3); + vec = __lsx_vseqi_h(in0, 0); + vec = __lsx_vxori_b(vec, 255); + vec = __lsx_vand_v(mask, vec); + in0 = __lsx_vadd_h(in0, vec); + } + + VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); + LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3); + VP9_FDCT4(in0, in1, in2, in3, in0, in1, 
in2, in3); + LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3); + DUP4_ARG2(__lsx_vaddi_hu, in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3); + DUP4_ARG2(__lsx_vsrai_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3); + DUP2_ARG2(__lsx_vpickev_d, in1, in0, in3, in2, in0, in2); + __lsx_vst(in0, output, 0); + __lsx_vst(in2, output, 16); +} + +void vpx_fdct8x8_lsx(const int16_t *input, int16_t *output, + int32_t src_stride) { + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride4 = src_stride2 << 1; + int32_t src_stride6 = src_stride4 + src_stride2; + int16_t *input_tmp = (int16_t *)input; + + in0 = __lsx_vld(input_tmp, 0); + DUP2_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, in1, + in2); + in3 = __lsx_vldx(input_tmp, src_stride6); + input_tmp += src_stride4; + in4 = __lsx_vld(input_tmp, 0); + DUP2_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, in5, + in6); + in7 = __lsx_vldx(input_tmp, src_stride6); + + DUP4_ARG2(__lsx_vslli_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3); + DUP4_ARG2(__lsx_vslli_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7); + + VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, + in5, in6, in7); + LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, + in5, in6, in7); + LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7); + + __lsx_vst(in0, output, 0); + __lsx_vst(in1, output, 16); + __lsx_vst(in2, output, 32); + __lsx_vst(in3, output, 48); + __lsx_vst(in4, output, 64); + __lsx_vst(in5, output, 80); + __lsx_vst(in6, output, 96); + __lsx_vst(in7, output, 112); +} + +void vpx_fdct16x16_lsx(const int16_t *input, int16_t *output, + int32_t src_stride) { + int32_t i; + DECLARE_ALIGNED(32, int16_t, tmp_buf[16 * 16]); + + /* column transform */ + for (i = 0; i < 2; ++i) { + fdct8x16_1d_column((input + 8 * i), (&tmp_buf[0] + 8 * i), src_stride); + } + + /* row transform */ + for (i = 0; i < 2; ++i) { + fdct16x8_1d_row((&tmp_buf[0] + (128 * i)), (output + (128 * i))); + } +} +#endif // !CONFIG_VP9_HIGHBITDEPTH diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.h b/media/libvpx/libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.h new file mode 100644 index 0000000000..4a9fce9a3d --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.h @@ -0,0 +1,381 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VPX_VPX_DSP_LOONGARCH_FWD_TXFM_LSX_H_ +#define VPX_VPX_DSP_LOONGARCH_FWD_TXFM_LSX_H_ + +#include "vpx_dsp/loongarch/txfm_macros_lsx.h" +#include "vpx_dsp/txfm_common.h" + +#define VP9_FDCT4(in0, in1, in2, in3, out0, out1, out2, out3) \ + do { \ + __m128i cnst0_m, cnst1_m, cnst2_m, cnst3_m; \ + __m128i vec0_m, vec1_m, vec2_m, vec3_m; \ + __m128i vec4_m, vec5_m, vec6_m, vec7_m; \ + __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x000000000000c4df }; \ + \ + LSX_BUTTERFLY_4_H(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m); \ + DUP2_ARG2(__lsx_vilvl_h, vec1_m, vec0_m, vec3_m, vec2_m, vec0_m, vec2_m); \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 0, coeff_m, 1, cnst0_m, cnst1_m); \ + cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \ + vec5_m = __lsx_vdp2_w_h(vec0_m, cnst1_m); \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 4, coeff_m, 3, cnst2_m, cnst3_m); \ + cnst2_m = __lsx_vpackev_h(cnst3_m, cnst2_m); \ + vec7_m = __lsx_vdp2_w_h(vec2_m, cnst2_m); \ + \ + vec4_m = __lsx_vdp2_w_h(vec0_m, cnst0_m); \ + cnst2_m = __lsx_vreplvei_h(coeff_m, 2); \ + cnst2_m = __lsx_vpackev_h(cnst2_m, cnst3_m); \ + vec6_m = __lsx_vdp2_w_h(vec2_m, cnst2_m); \ + \ + DUP4_ARG3(__lsx_vssrarni_h_w, vec4_m, vec4_m, DCT_CONST_BITS, vec5_m, \ + vec5_m, DCT_CONST_BITS, vec6_m, vec6_m, DCT_CONST_BITS, vec7_m, \ + vec7_m, DCT_CONST_BITS, out0, out2, out1, out3); \ + } while (0) + +#define VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ + out3, out4, out5, out6, out7) \ + do { \ + __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m; \ + __m128i s7_m, x0_m, x1_m, x2_m, x3_m; \ + __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x238e35370c7c3ec5 }; \ + \ + /* FDCT stage1 */ \ + LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, \ + s2_m, s3_m, s4_m, s5_m, s6_m, s7_m); \ + LSX_BUTTERFLY_4_H(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \ + DUP2_ARG2(__lsx_vilvh_h, x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \ + DUP2_ARG2(__lsx_vilvl_h, x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 0, coeff_m, 1, x0_m, x1_m); \ + x1_m = __lsx_vpackev_h(x1_m, x0_m); \ + DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, out4); \ + \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 2, coeff_m, 3, x2_m, x3_m); \ + x2_m = __lsx_vneg_h(x2_m); \ + x2_m = __lsx_vpackev_h(x3_m, x2_m); \ + DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out6); \ + \ + DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, out0); \ + x2_m = __lsx_vreplvei_h(coeff_m, 2); \ + x2_m = __lsx_vpackev_h(x2_m, x3_m); \ + DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out2); \ + \ + /* stage2 */ \ + s1_m = __lsx_vilvl_h(s5_m, s6_m); \ + s0_m = __lsx_vilvh_h(s5_m, s6_m); \ + \ + DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, s6_m); \ + DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, s5_m); \ + \ + /* stage3 */ \ + LSX_BUTTERFLY_4_H(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m); \ + \ + /* stage4 */ \ + DUP2_ARG2(__lsx_vilvh_h, x3_m, x0_m, x2_m, x1_m, s4_m, s6_m); \ + DUP2_ARG2(__lsx_vilvl_h, x3_m, x0_m, x2_m, x1_m, s5_m, s7_m); \ + \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 4, coeff_m, 5, x0_m, x1_m); \ + x1_m = __lsx_vpackev_h(x0_m, x1_m); \ + DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m, out1); \ + \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 6, coeff_m, 7, x2_m, x3_m); \ + x2_m = __lsx_vpackev_h(x3_m, x2_m); \ + DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out5); \ + \ + x1_m = __lsx_vreplvei_h(coeff_m, 5); \ + x0_m = __lsx_vneg_h(x0_m); \ + x0_m = __lsx_vpackev_h(x1_m, x0_m); \ + DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m, out7); \ + x2_m = __lsx_vreplvei_h(coeff_m, 6); \ + x3_m = __lsx_vneg_h(x3_m); \ + 
x2_m = __lsx_vpackev_h(x2_m, x3_m); \ + DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out3); \ + } while (0) + +#define SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7) \ + do { \ + __m128i vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + \ + DUP4_ARG2(__lsx_vsrli_h, in0, 15, in1, 15, in2, 15, in3, 15, vec0_m, \ + vec1_m, vec2_m, vec3_m); \ + DUP4_ARG2(__lsx_vsrli_h, in4, 15, in5, 15, in6, 15, in7, 15, vec4_m, \ + vec5_m, vec6_m, vec7_m); \ + DUP4_ARG2(__lsx_vavg_h, vec0_m, in0, vec1_m, in1, vec2_m, in2, vec3_m, \ + in3, in0, in1, in2, in3); \ + DUP4_ARG2(__lsx_vavg_h, vec4_m, in4, vec5_m, in5, vec6_m, in6, vec7_m, \ + in7, in4, in5, in6, in7); \ + } while (0) + +#define FDCT32_POSTPROC_2V_POS_H(vec0, vec1) \ + do { \ + __m128i tp0_m, tp1_m; \ + __m128i one = __lsx_vreplgr2vr_h(1); \ + \ + tp0_m = __lsx_vslei_h(vec0, 0); \ + tp1_m = __lsx_vslei_h(vec1, 0); \ + tp0_m = __lsx_vxori_b(tp0_m, 255); \ + tp1_m = __lsx_vxori_b(tp1_m, 255); \ + vec0 = __lsx_vadd_h(vec0, one); \ + vec1 = __lsx_vadd_h(vec1, one); \ + tp0_m = __lsx_vand_v(one, tp0_m); \ + tp1_m = __lsx_vand_v(one, tp1_m); \ + vec0 = __lsx_vadd_h(vec0, tp0_m); \ + vec1 = __lsx_vadd_h(vec1, tp1_m); \ + vec0 = __lsx_vsrai_h(vec0, 2); \ + vec1 = __lsx_vsrai_h(vec1, 2); \ + } while (0) + +#define FDCT_POSTPROC_2V_NEG_H(vec0, vec1) \ + do { \ + __m128i tp0_m, tp1_m; \ + __m128i one_m = __lsx_vldi(0x401); \ + \ + tp0_m = __lsx_vslti_h(vec0, 0); \ + tp1_m = __lsx_vslti_h(vec1, 0); \ + vec0 = __lsx_vadd_h(vec0, one_m); \ + vec1 = __lsx_vadd_h(vec1, one_m); \ + tp0_m = __lsx_vand_v(one_m, tp0_m); \ + tp1_m = __lsx_vand_v(one_m, tp1_m); \ + vec0 = __lsx_vadd_h(vec0, tp0_m); \ + vec1 = __lsx_vadd_h(vec1, tp1_m); \ + vec0 = __lsx_vsrai_h(vec0, 2); \ + vec1 = __lsx_vsrai_h(vec1, 2); \ + } while (0) + +#define FDCT32_POSTPROC_NEG_W(vec) \ + do { \ + __m128i temp_m; \ + __m128i one_m = __lsx_vreplgr2vr_w(1); \ + \ + temp_m = __lsx_vslti_w(vec, 0); \ + vec = __lsx_vadd_w(vec, one_m); \ + temp_m = __lsx_vand_v(one_m, temp_m); \ + vec = __lsx_vadd_w(vec, temp_m); \ + vec = __lsx_vsrai_w(vec, 2); \ + } while (0) + +#define DOTP_CONST_PAIR_W(reg0_left, reg1_left, reg0_right, reg1_right, \ + const0, const1, out0, out1, out2, out3) \ + do { \ + __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \ + __m128i tp0_m, tp1_m, tp2_m, tp3_m, _tmp0, _tmp1; \ + __m128i k0_m = __lsx_vreplgr2vr_w((int32_t)const0); \ + \ + s0_m = __lsx_vreplgr2vr_w((int32_t)const1); \ + k0_m = __lsx_vpackev_w(s0_m, k0_m); \ + \ + DUP2_ARG1(__lsx_vneg_w, reg1_left, reg1_right, _tmp0, _tmp1); \ + s1_m = __lsx_vilvl_w(_tmp0, reg0_left); \ + s0_m = __lsx_vilvh_w(_tmp0, reg0_left); \ + s3_m = __lsx_vilvl_w(reg0_left, reg1_left); \ + s2_m = __lsx_vilvh_w(reg0_left, reg1_left); \ + s5_m = __lsx_vilvl_w(_tmp1, reg0_right); \ + s4_m = __lsx_vilvh_w(_tmp1, reg0_right); \ + s7_m = __lsx_vilvl_w(reg0_right, reg1_right); \ + s6_m = __lsx_vilvh_w(reg0_right, reg1_right); \ + DUP2_ARG2(__lsx_vdp2_d_w, s0_m, k0_m, s1_m, k0_m, tp0_m, tp1_m); \ + DUP2_ARG2(__lsx_vdp2_d_w, s4_m, k0_m, s5_m, k0_m, tp2_m, tp3_m); \ + DUP2_ARG3(__lsx_vssrarni_w_d, tp0_m, tp1_m, DCT_CONST_BITS, tp2_m, tp3_m, \ + DCT_CONST_BITS, out0, out1); \ + DUP2_ARG2(__lsx_vdp2_d_w, s2_m, k0_m, s3_m, k0_m, tp0_m, tp1_m); \ + DUP2_ARG2(__lsx_vdp2_d_w, s6_m, k0_m, s7_m, k0_m, tp2_m, tp3_m); \ + DUP2_ARG3(__lsx_vssrarni_w_d, tp0_m, tp1_m, DCT_CONST_BITS, tp2_m, tp3_m, \ + DCT_CONST_BITS, out2, out3); \ + } while (0) + +#define VP9_ADDBLK_ST8x4_UB(dst, _stride, _stride2, _stride3, in0, in1, in2, \ + in3) \ + do { \ + 
__m128i dst0_m, dst1_m, dst2_m, dst3_m; \ + __m128i tmp0_m, tmp1_m; \ + __m128i res0_m, res1_m, res2_m, res3_m; \ + \ + dst0_m = __lsx_vld(dst, 0); \ + DUP2_ARG2(__lsx_vldx, dst, _stride, dst, _stride2, dst1_m, dst2_m); \ + dst3_m = __lsx_vldx(dst, _stride3); \ + DUP4_ARG2(__lsx_vsllwil_hu_bu, dst0_m, 0, dst1_m, 0, dst2_m, 0, dst3_m, 0, \ + res0_m, res1_m, res2_m, res3_m); \ + DUP4_ARG2(__lsx_vadd_h, res0_m, in0, res1_m, in1, res2_m, in2, res3_m, \ + in3, res0_m, res1_m, res2_m, res3_m); \ + DUP2_ARG3(__lsx_vssrarni_bu_h, res1_m, res0_m, 0, res3_m, res2_m, 0, \ + tmp0_m, tmp1_m); \ + __lsx_vstelm_d(tmp0_m, dst, 0, 0); \ + __lsx_vstelm_d(tmp0_m, dst + _stride, 0, 1); \ + __lsx_vstelm_d(tmp1_m, dst + _stride2, 0, 0); \ + __lsx_vstelm_d(tmp1_m, dst + _stride3, 0, 1); \ + } while (0) + +#define FDCT8x16_EVEN(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3, out4, out5, out6, out7) \ + do { \ + __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \ + __m128i x0_m, x1_m, x2_m, x3_m; \ + __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x238e35370c7c3ec5 }; \ + \ + /* FDCT stage1 */ \ + LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, \ + s2_m, s3_m, s4_m, s5_m, s6_m, s7_m); \ + LSX_BUTTERFLY_4_H(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \ + DUP2_ARG2(__lsx_vilvh_h, x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \ + DUP2_ARG2(__lsx_vilvl_h, x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 0, coeff_m, 1, x0_m, x1_m); \ + x1_m = __lsx_vpackev_h(x1_m, x0_m); \ + DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, out4); \ + \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 2, coeff_m, 3, x2_m, x3_m); \ + x2_m = __lsx_vneg_h(x2_m); \ + x2_m = __lsx_vpackev_h(x3_m, x2_m); \ + DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out6); \ + \ + DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, out0); \ + x2_m = __lsx_vreplvei_h(coeff_m, 2); \ + x2_m = __lsx_vpackev_h(x2_m, x3_m); \ + DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out2); \ + \ + /* stage2 */ \ + s1_m = __lsx_vilvl_h(s5_m, s6_m); \ + s0_m = __lsx_vilvh_h(s5_m, s6_m); \ + \ + DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, s6_m); \ + DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, s5_m); \ + \ + /* stage3 */ \ + LSX_BUTTERFLY_4_H(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m); \ + \ + /* stage4 */ \ + DUP2_ARG2(__lsx_vilvh_h, x3_m, x0_m, x2_m, x1_m, s4_m, s6_m); \ + DUP2_ARG2(__lsx_vilvl_h, x3_m, x0_m, x2_m, x1_m, s5_m, s7_m); \ + \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 4, coeff_m, 5, x0_m, x1_m); \ + x1_m = __lsx_vpackev_h(x0_m, x1_m); \ + DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m, out1); \ + \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 6, coeff_m, 7, x2_m, x3_m); \ + x2_m = __lsx_vpackev_h(x3_m, x2_m); \ + DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out5); \ + \ + x1_m = __lsx_vreplvei_h(coeff_m, 5); \ + x0_m = __lsx_vneg_h(x0_m); \ + x0_m = __lsx_vpackev_h(x1_m, x0_m); \ + DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m, out7); \ + \ + x2_m = __lsx_vreplvei_h(coeff_m, 6); \ + x3_m = __lsx_vneg_h(x3_m); \ + x2_m = __lsx_vpackev_h(x2_m, x3_m); \ + DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out3); \ + } while (0) + +#define FDCT8x16_ODD(input0, input1, input2, input3, input4, input5, input6, \ + input7, out1, out3, out5, out7, out9, out11, out13, \ + out15) \ + do { \ + __m128i stp21_m, stp22_m, stp23_m, stp24_m, stp25_m, stp26_m; \ + __m128i stp30_m, stp31_m, stp32_m, stp33_m, stp34_m, stp35_m; \ + __m128i stp36_m, stp37_m, vec0_m, vec1_m; \ + __m128i vec2_m, vec3_m, vec4_m, vec5_m, vec6_m; \ + __m128i cnst0_m, cnst1_m, cnst4_m, cnst5_m; \ + __m128i coeff_m = { 
0x187e3b21d2bf2d41, 0x238e3537e782c4df }; \ + __m128i coeff1_m = { 0x289a317906463fb1, 0x12943d3f1e2b3871 }; \ + __m128i coeff2_m = { 0xed6cd766c78fc04f, 0x0 }; \ + \ + /* stp 1 */ \ + DUP2_ARG2(__lsx_vilvh_h, input2, input5, input3, input4, vec2_m, vec4_m); \ + DUP2_ARG2(__lsx_vilvl_h, input2, input5, input3, input4, vec3_m, vec5_m); \ + \ + cnst4_m = __lsx_vreplvei_h(coeff_m, 0); \ + DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst4_m, stp25_m); \ + \ + cnst5_m = __lsx_vreplvei_h(coeff_m, 1); \ + cnst5_m = __lsx_vpackev_h(cnst5_m, cnst4_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst5_m, stp22_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst4_m, stp24_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst5_m, stp23_m); \ + \ + /* stp2 */ \ + LSX_BUTTERFLY_4_H(input0, input1, stp22_m, stp23_m, stp30_m, stp31_m, \ + stp32_m, stp33_m); \ + LSX_BUTTERFLY_4_H(input7, input6, stp25_m, stp24_m, stp37_m, stp36_m, \ + stp35_m, stp34_m); \ + \ + DUP2_ARG2(__lsx_vilvh_h, stp36_m, stp31_m, stp35_m, stp32_m, vec2_m, \ + vec4_m); \ + DUP2_ARG2(__lsx_vilvl_h, stp36_m, stp31_m, stp35_m, stp32_m, vec3_m, \ + vec5_m); \ + \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 2, coeff_m, 3, cnst0_m, cnst1_m); \ + cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m, stp26_m); \ + \ + cnst0_m = __lsx_vreplvei_h(coeff_m, 4); \ + cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m, stp21_m); \ + \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 5, coeff_m, 2, cnst0_m, cnst1_m); \ + cnst1_m = __lsx_vpackev_h(cnst0_m, cnst1_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m, stp25_m); \ + \ + cnst0_m = __lsx_vreplvei_h(coeff_m, 3); \ + cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m, stp22_m); \ + \ + /* stp4 */ \ + LSX_BUTTERFLY_4_H(stp30_m, stp37_m, stp26_m, stp21_m, vec6_m, vec2_m, \ + vec4_m, vec5_m); \ + LSX_BUTTERFLY_4_H(stp33_m, stp34_m, stp25_m, stp22_m, stp21_m, stp23_m, \ + stp24_m, stp31_m); \ + \ + vec1_m = __lsx_vilvl_h(vec2_m, vec6_m); \ + vec0_m = __lsx_vilvh_h(vec2_m, vec6_m); \ + DUP2_ARG2(__lsx_vreplvei_h, coeff1_m, 0, coeff1_m, 1, cnst0_m, cnst1_m); \ + cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m); \ + \ + DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out1); \ + \ + cnst0_m = __lsx_vreplvei_h(coeff2_m, 0); \ + cnst0_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out15); \ + \ + vec1_m = __lsx_vilvl_h(vec4_m, vec5_m); \ + vec0_m = __lsx_vilvh_h(vec4_m, vec5_m); \ + DUP2_ARG2(__lsx_vreplvei_h, coeff1_m, 2, coeff1_m, 3, cnst0_m, cnst1_m); \ + cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \ + \ + DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m, out9); \ + \ + cnst1_m = __lsx_vreplvei_h(coeff2_m, 2); \ + cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out7); \ + \ + vec1_m = __lsx_vilvl_h(stp23_m, stp21_m); \ + vec0_m = __lsx_vilvh_h(stp23_m, stp21_m); \ + DUP2_ARG2(__lsx_vreplvei_h, coeff1_m, 4, coeff1_m, 5, cnst0_m, cnst1_m); \ + cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out5); \ + \ + cnst0_m = __lsx_vreplvei_h(coeff2_m, 1); \ + cnst0_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out11); \ + \ + vec1_m = __lsx_vilvl_h(stp24_m, stp31_m); \ + vec0_m = __lsx_vilvh_h(stp24_m, stp31_m); \ + DUP2_ARG2(__lsx_vreplvei_h, coeff1_m, 6, coeff1_m, 7, cnst0_m, cnst1_m); \ + cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m); 
\ + \ + DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m, out13); \ + \ + cnst1_m = __lsx_vreplvei_h(coeff2_m, 3); \ + cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out3); \ + } while (0) + +void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr, + int32_t src_stride); +void fdct16x8_1d_row(int16_t *input, int16_t *output); +#endif // VPX_VPX_DSP_LOONGARCH_FWD_TXFM_LSX_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/idct32x32_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/idct32x32_lsx.c new file mode 100644 index 0000000000..ec07f57d90 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/idct32x32_lsx.c @@ -0,0 +1,834 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/fwd_txfm_lsx.h" + +#define UNPCK_UB_SH(_in, _out0, _out1) \ + do { \ + _out0 = __lsx_vsllwil_hu_bu(_in, 0); \ + _out1 = __lsx_vexth_hu_bu(_in); \ + } while (0) + +static void idct32x8_row_transpose_store(const int16_t *input, + int16_t *tmp_buf) { + __m128i m0, m1, m2, m3, m4, m5, m6, m7; + __m128i n0, n1, n2, n3, n4, n5, n6, n7; + + /* 1st & 2nd 8x8 */ + DUP4_ARG2(__lsx_vld, input, 0, input, 64, input, 128, input, 192, m0, n0, m1, + n1); + DUP4_ARG2(__lsx_vld, input, 256, input, 320, input, 384, input, 448, m2, n2, + m3, n3); + DUP4_ARG2(__lsx_vld, input, 16, input, 80, input, 144, input, 208, m4, n4, m5, + n5); + DUP4_ARG2(__lsx_vld, input, 272, input, 336, input, 400, input, 464, m6, n6, + m7, n7); + + LSX_TRANSPOSE8x8_H(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3, + n3); + LSX_TRANSPOSE8x8_H(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7, + n7); + + __lsx_vst(m0, tmp_buf, 0); + __lsx_vst(n0, tmp_buf, 16); + __lsx_vst(m1, tmp_buf, 32); + __lsx_vst(n1, tmp_buf, 48); + __lsx_vst(m2, tmp_buf, 64); + __lsx_vst(n2, tmp_buf, 80); + __lsx_vst(m3, tmp_buf, 96); + __lsx_vst(n3, tmp_buf, 112); + __lsx_vst(m4, tmp_buf, 128); + __lsx_vst(n4, tmp_buf, 144); + __lsx_vst(m5, tmp_buf, 160); + __lsx_vst(n5, tmp_buf, 176); + __lsx_vst(m6, tmp_buf, 192); + __lsx_vst(n6, tmp_buf, 208); + __lsx_vst(m7, tmp_buf, 224); + __lsx_vst(n7, tmp_buf, 240); + + /* 3rd & 4th 8x8 */ + DUP4_ARG2(__lsx_vld, input, 32, input, 96, input, 160, input, 224, m0, n0, m1, + n1); + DUP4_ARG2(__lsx_vld, input, 288, input, 352, input, 416, input, 480, m2, n2, + m3, n3); + DUP4_ARG2(__lsx_vld, input, 48, input, 112, input, 176, input, 240, m4, n4, + m5, n5); + DUP4_ARG2(__lsx_vld, input, 304, input, 368, input, 432, input, 496, m6, n6, + m7, n7); + + LSX_TRANSPOSE8x8_H(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3, + n3); + LSX_TRANSPOSE8x8_H(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7, + n7); + + __lsx_vst(m0, tmp_buf, 256); + __lsx_vst(n0, tmp_buf, 272); + __lsx_vst(m1, tmp_buf, 288); + __lsx_vst(n1, tmp_buf, 304); + __lsx_vst(m2, tmp_buf, 320); + __lsx_vst(n2, tmp_buf, 336); + __lsx_vst(m3, tmp_buf, 352); + __lsx_vst(n3, tmp_buf, 368); + __lsx_vst(m4, tmp_buf, 384); + __lsx_vst(n4, tmp_buf, 400); + __lsx_vst(m5, tmp_buf, 416); + __lsx_vst(n5, tmp_buf, 432); + __lsx_vst(m6, tmp_buf, 448); + __lsx_vst(n6, tmp_buf, 464); + 
__lsx_vst(m7, tmp_buf, 480); + __lsx_vst(n7, tmp_buf, 496); +} + +static void idct32x8_row_even_process_store(int16_t *tmp_buf, + int16_t *tmp_eve_buf) { + __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + __m128i stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7; + __m128i tmp0; + + /* Even stage 1 */ + DUP4_ARG2(__lsx_vld, tmp_buf, 0, tmp_buf, 64, tmp_buf, 128, tmp_buf, 192, + reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vld, tmp_buf, 256, tmp_buf, 320, tmp_buf, 384, tmp_buf, 448, + reg4, reg5, reg6, reg7); + + DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7); + DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3); + LSX_BUTTERFLY_4_H(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0); + DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + + loc1 = vec3; + loc0 = vec1; + + DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4); + DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6); + LSX_BUTTERFLY_4_H(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0); + LSX_BUTTERFLY_4_H(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4); + LSX_BUTTERFLY_4_H(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5); + + /* Even stage 2 */ + DUP4_ARG2(__lsx_vld, tmp_buf, 32, tmp_buf, 96, tmp_buf, 160, tmp_buf, 224, + reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vld, tmp_buf, 288, tmp_buf, 352, tmp_buf, 416, tmp_buf, 480, + reg4, reg5, reg6, reg7); + DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7); + DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3); + DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5); + DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1); + + vec0 = __lsx_vadd_h(reg0, reg4); + reg0 = __lsx_vsub_h(reg0, reg4); + reg4 = __lsx_vadd_h(reg6, reg2); + reg6 = __lsx_vsub_h(reg6, reg2); + reg2 = __lsx_vadd_h(reg1, reg5); + reg1 = __lsx_vsub_h(reg1, reg5); + reg5 = __lsx_vadd_h(reg7, reg3); + reg7 = __lsx_vsub_h(reg7, reg3); + reg3 = vec0; + + vec1 = reg2; + reg2 = __lsx_vadd_h(reg3, reg4); + reg3 = __lsx_vsub_h(reg3, reg4); + reg4 = __lsx_vsub_h(reg5, vec1); + reg5 = __lsx_vadd_h(reg5, vec1); + + tmp0 = __lsx_vneg_h(reg6); + DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7); + DOTP_CONST_PAIR(tmp0, reg1, cospi_24_64, cospi_8_64, reg6, reg1); + + vec0 = __lsx_vsub_h(reg0, reg6); + reg0 = __lsx_vadd_h(reg0, reg6); + vec1 = __lsx_vsub_h(reg7, reg1); + reg7 = __lsx_vadd_h(reg7, reg1); + + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1); + DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4); + + /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */ + LSX_BUTTERFLY_4_H(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0); + __lsx_vst(loc0, tmp_eve_buf, 240); + __lsx_vst(loc1, tmp_eve_buf, 0); + __lsx_vst(loc2, tmp_eve_buf, 224); + __lsx_vst(loc3, tmp_eve_buf, 16); + + LSX_BUTTERFLY_4_H(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0); + __lsx_vst(loc0, tmp_eve_buf, 208); + __lsx_vst(loc1, tmp_eve_buf, 32); + __lsx_vst(loc2, tmp_eve_buf, 192); + __lsx_vst(loc3, tmp_eve_buf, 48); + + /* Store 8 */ + LSX_BUTTERFLY_4_H(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0); + __lsx_vst(loc0, tmp_eve_buf, 176); + __lsx_vst(loc1, tmp_eve_buf, 64); + __lsx_vst(loc2, tmp_eve_buf, 160); + __lsx_vst(loc3, tmp_eve_buf, 80); + + LSX_BUTTERFLY_4_H(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0); + __lsx_vst(loc0, tmp_eve_buf, 144); + __lsx_vst(loc1, tmp_eve_buf, 96); + __lsx_vst(loc2, tmp_eve_buf, 128); + 
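+  /* Note: the 16 even-stage results straddle tmp_eve_buf, one half
+   * ascending from offset 0 and the mirrored half descending from
+   * offset 240, so the final butterfly can pair entries from both
+   * ends of the buffer. */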
__lsx_vst(loc3, tmp_eve_buf, 112); +} + +static void idct32x8_row_odd_process_store(int16_t *tmp_buf, + int16_t *tmp_odd_buf) { + __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + + /* Odd stage 1 */ + DUP4_ARG2(__lsx_vld, tmp_buf, 16, tmp_buf, 112, tmp_buf, 144, tmp_buf, 240, + reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vld, tmp_buf, 272, tmp_buf, 368, tmp_buf, 400, tmp_buf, 496, + reg4, reg5, reg6, reg7); + + DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7); + DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4); + DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5); + DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6); + + vec0 = __lsx_vadd_h(reg0, reg3); + reg0 = __lsx_vsub_h(reg0, reg3); + reg3 = __lsx_vadd_h(reg7, reg4); + reg7 = __lsx_vsub_h(reg7, reg4); + reg4 = __lsx_vadd_h(reg1, reg2); + reg1 = __lsx_vsub_h(reg1, reg2); + reg2 = __lsx_vadd_h(reg6, reg5); + reg6 = __lsx_vsub_h(reg6, reg5); + reg5 = vec0; + + /* 4 Stores */ + DUP2_ARG2(__lsx_vadd_h, reg5, reg4, reg3, reg2, vec0, vec1); + __lsx_vst(vec0, tmp_odd_buf, 64); + __lsx_vst(vec1, tmp_odd_buf, 80); + + DUP2_ARG2(__lsx_vsub_h, reg5, reg4, reg3, reg2, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1); + __lsx_vst(vec0, tmp_odd_buf, 0); + __lsx_vst(vec1, tmp_odd_buf, 16); + + /* 4 Stores */ + DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7); + DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6); + LSX_BUTTERFLY_4_H(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3); + __lsx_vst(vec0, tmp_odd_buf, 96); + __lsx_vst(vec1, tmp_odd_buf, 112); + + DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3); + __lsx_vst(vec2, tmp_odd_buf, 32); + __lsx_vst(vec3, tmp_odd_buf, 48); + + /* Odd stage 2 */ + /* 8 loads */ + DUP4_ARG2(__lsx_vld, tmp_buf, 48, tmp_buf, 80, tmp_buf, 176, tmp_buf, 208, + reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vld, tmp_buf, 304, tmp_buf, 336, tmp_buf, 432, tmp_buf, 464, + reg4, reg5, reg6, reg7); + + DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6); + DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5); + DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4); + DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7); + + /* 4 Stores */ + DUP4_ARG2(__lsx_vsub_h, reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0, + vec1, vec2, vec3); + DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1); + DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3); + + LSX_BUTTERFLY_4_H(loc3, loc2, loc0, loc1, vec1, vec0, vec2, vec3); + __lsx_vst(vec0, tmp_odd_buf, 192); + __lsx_vst(vec1, tmp_odd_buf, 240); + + DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1); + __lsx_vst(vec0, tmp_odd_buf, 160); + __lsx_vst(vec1, tmp_odd_buf, 176); + + /* 4 Stores */ + DUP4_ARG2(__lsx_vadd_h, reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec1, + vec2, vec0, vec3); + LSX_BUTTERFLY_4_H(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2); + __lsx_vst(reg0, tmp_odd_buf, 208); + __lsx_vst(reg1, tmp_odd_buf, 224); + + DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1); + __lsx_vst(reg0, tmp_odd_buf, 128); + __lsx_vst(reg1, tmp_odd_buf, 144); + + /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */ + + /* Load 8 & Store 8 */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 0, tmp_odd_buf, 16, tmp_odd_buf, 32, + tmp_odd_buf, 48, reg0, reg1, reg2, reg3); + 
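+  /* reg4..reg7 reload the odd-stage-2 results saved at offsets
+   * 128..176 above, to be combined with the stage-1 terms in
+   * reg0..reg3. */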
DUP4_ARG2(__lsx_vld, tmp_odd_buf, 128, tmp_odd_buf, 144, tmp_odd_buf, 160, + tmp_odd_buf, 176, reg4, reg5, reg6, reg7); + DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, + loc1, loc2, loc3); + __lsx_vst(loc0, tmp_odd_buf, 0); + __lsx_vst(loc1, tmp_odd_buf, 16); + __lsx_vst(loc2, tmp_odd_buf, 32); + __lsx_vst(loc3, tmp_odd_buf, 48); + + DUP2_ARG2(__lsx_vsub_h, reg0, reg4, reg1, reg5, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); + + DUP2_ARG2(__lsx_vsub_h, reg2, reg6, reg3, reg7, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + __lsx_vst(loc0, tmp_odd_buf, 128); + __lsx_vst(loc1, tmp_odd_buf, 144); + __lsx_vst(loc2, tmp_odd_buf, 160); + __lsx_vst(loc3, tmp_odd_buf, 176); + + /* Load 8 & Store 8 */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 64, tmp_odd_buf, 80, tmp_odd_buf, 96, + tmp_odd_buf, 112, reg1, reg2, reg0, reg3); + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 192, tmp_odd_buf, 208, tmp_odd_buf, 224, + tmp_odd_buf, 240, reg4, reg5, reg6, reg7); + + DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, + loc1, loc2, loc3); + __lsx_vst(loc0, tmp_odd_buf, 64); + __lsx_vst(loc1, tmp_odd_buf, 80); + __lsx_vst(loc2, tmp_odd_buf, 96); + __lsx_vst(loc3, tmp_odd_buf, 112); + + DUP2_ARG2(__lsx_vsub_h, reg0, reg4, reg3, reg7, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); + DUP2_ARG2(__lsx_vsub_h, reg1, reg5, reg2, reg6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + __lsx_vst(loc0, tmp_odd_buf, 192); + __lsx_vst(loc1, tmp_odd_buf, 208); + __lsx_vst(loc2, tmp_odd_buf, 224); + __lsx_vst(loc3, tmp_odd_buf, 240); +} + +static void idct_butterfly_transpose_store(int16_t *tmp_buf, + int16_t *tmp_eve_buf, + int16_t *tmp_odd_buf, int16_t *dst) { + __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + __m128i m0, m1, m2, m3, m4, m5, m6, m7; + __m128i n0, n1, n2, n3, n4, n5, n6, n7; + __m128i reg0, reg1, reg2, reg3; + + /* FINAL BUTTERFLY : Dependency on Even & Odd */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 0, tmp_odd_buf, 144, tmp_odd_buf, 224, + tmp_odd_buf, 96, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vld, tmp_eve_buf, 0, tmp_eve_buf, 128, tmp_eve_buf, 64, + tmp_eve_buf, 192, loc0, loc1, loc2, loc3); + + DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0, + m4, m2, m6); + DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, reg0, + reg1, reg2, reg3); + __lsx_vst(reg0, tmp_buf, 496); + __lsx_vst(reg1, tmp_buf, 368); + __lsx_vst(reg2, tmp_buf, 432); + __lsx_vst(reg3, tmp_buf, 304); + + /* Load 8 & Store 8 */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 64, tmp_odd_buf, 208, tmp_odd_buf, 160, + tmp_odd_buf, 48, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vld, tmp_eve_buf, 32, tmp_eve_buf, 160, tmp_eve_buf, 96, + tmp_eve_buf, 224, loc0, loc1, loc2, loc3); + + DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, + m5, m3, m7); + DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, reg0, + reg1, reg2, reg3); + __lsx_vst(reg0, tmp_buf, 464); + __lsx_vst(reg1, tmp_buf, 336); + __lsx_vst(reg2, tmp_buf, 400); + __lsx_vst(reg3, tmp_buf, 272); + + /* Load 8 & Store 8 */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 32, tmp_odd_buf, 176, tmp_odd_buf, 192, + tmp_odd_buf, 112, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vld, tmp_eve_buf, 16, tmp_eve_buf, 144, tmp_eve_buf, 80, + tmp_eve_buf, 208, loc0, loc1, loc2, loc3); + + DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, 
loc2, vec1, loc3, vec0, n0, + n4, n2, n6); + DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, reg0, + reg1, reg2, reg3); + __lsx_vst(reg0, tmp_buf, 480); + __lsx_vst(reg1, tmp_buf, 352); + __lsx_vst(reg2, tmp_buf, 416); + __lsx_vst(reg3, tmp_buf, 288); + + /* Load 8 & Store 8 */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 80, tmp_odd_buf, 240, tmp_odd_buf, 128, + tmp_odd_buf, 16, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vld, tmp_eve_buf, 48, tmp_eve_buf, 176, tmp_eve_buf, 112, + tmp_eve_buf, 240, loc0, loc1, loc2, loc3); + DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, + n5, n3, n7); + DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, reg0, + reg1, reg2, reg3); + __lsx_vst(reg0, tmp_buf, 448); + __lsx_vst(reg1, tmp_buf, 320); + __lsx_vst(reg2, tmp_buf, 384); + __lsx_vst(reg3, tmp_buf, 256); + + /* Transpose : 16 vectors */ + /* 1st & 2nd 8x8 */ + LSX_TRANSPOSE8x8_H(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3, + n3); + __lsx_vst(m0, dst, 0); + __lsx_vst(n0, dst, 64); + __lsx_vst(m1, dst, 128); + __lsx_vst(n1, dst, 192); + __lsx_vst(m2, dst, 256); + __lsx_vst(n2, dst, 320); + __lsx_vst(m3, dst, 384); + __lsx_vst(n3, dst, 448); + + LSX_TRANSPOSE8x8_H(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7, + n7); + __lsx_vst(m4, dst, 16); + __lsx_vst(n4, dst, 80); + __lsx_vst(m5, dst, 144); + __lsx_vst(n5, dst, 208); + __lsx_vst(m6, dst, 272); + __lsx_vst(n6, dst, 336); + __lsx_vst(m7, dst, 400); + __lsx_vst(n7, dst, 464); + + /* 3rd & 4th 8x8 */ + DUP4_ARG2(__lsx_vld, tmp_buf, 256, tmp_buf, 272, tmp_buf, 288, tmp_buf, 304, + m0, n0, m1, n1); + DUP4_ARG2(__lsx_vld, tmp_buf, 320, tmp_buf, 336, tmp_buf, 352, tmp_buf, 368, + m2, n2, m3, n3); + DUP4_ARG2(__lsx_vld, tmp_buf, 384, tmp_buf, 400, tmp_buf, 416, tmp_buf, 432, + m4, n4, m5, n5); + DUP4_ARG2(__lsx_vld, tmp_buf, 448, tmp_buf, 464, tmp_buf, 480, tmp_buf, 496, + m6, n6, m7, n7); + LSX_TRANSPOSE8x8_H(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3, + n3); + LSX_TRANSPOSE8x8_H(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7, + n7); + __lsx_vst(m0, dst, 32); + __lsx_vst(n0, dst, 96); + __lsx_vst(m1, dst, 160); + __lsx_vst(n1, dst, 224); + __lsx_vst(m2, dst, 288); + __lsx_vst(n2, dst, 352); + __lsx_vst(m3, dst, 416); + __lsx_vst(n3, dst, 480); + __lsx_vst(m4, dst, 48); + __lsx_vst(n4, dst, 112); + __lsx_vst(m5, dst, 176); + __lsx_vst(n5, dst, 240); + __lsx_vst(m6, dst, 304); + __lsx_vst(n6, dst, 368); + __lsx_vst(m7, dst, 432); + __lsx_vst(n7, dst, 496); +} + +static void idct32x8_1d_rows_lsx(const int16_t *input, int16_t *output) { + DECLARE_ALIGNED(32, int16_t, tmp_buf[8 * 32]); + DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]); + DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]); + + idct32x8_row_transpose_store(input, &tmp_buf[0]); + idct32x8_row_even_process_store(&tmp_buf[0], &tmp_eve_buf[0]); + idct32x8_row_odd_process_store(&tmp_buf[0], &tmp_odd_buf[0]); + idct_butterfly_transpose_store(&tmp_buf[0], &tmp_eve_buf[0], &tmp_odd_buf[0], + output); +} + +static void idct8x32_column_even_process_store(int16_t *tmp_buf, + int16_t *tmp_eve_buf) { + __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + __m128i stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7; + __m128i tmp0; + + /* Even stage 1 */ + DUP4_ARG2(__lsx_vld, tmp_buf, 0, tmp_buf, 256, tmp_buf, 512, tmp_buf, 768, + reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vld, tmp_buf, 1024, tmp_buf, 1280, tmp_buf, 1536, tmp_buf, + 
1792, reg4, reg5, reg6, reg7); + tmp_buf += 64; + + DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7); + DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3); + LSX_BUTTERFLY_4_H(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0); + DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + + loc1 = vec3; + loc0 = vec1; + + DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4); + DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6); + LSX_BUTTERFLY_4_H(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0); + LSX_BUTTERFLY_4_H(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4); + LSX_BUTTERFLY_4_H(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5); + + /* Even stage 2 */ + /* Load 8 */ + DUP4_ARG2(__lsx_vld, tmp_buf, 0, tmp_buf, 256, tmp_buf, 512, tmp_buf, 768, + reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vld, tmp_buf, 1024, tmp_buf, 1280, tmp_buf, 1536, tmp_buf, + 1792, reg4, reg5, reg6, reg7); + DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7); + DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3); + DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5); + DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1); + + vec0 = __lsx_vadd_h(reg0, reg4); + reg0 = __lsx_vsub_h(reg0, reg4); + reg4 = __lsx_vadd_h(reg6, reg2); + reg6 = __lsx_vsub_h(reg6, reg2); + reg2 = __lsx_vadd_h(reg1, reg5); + reg1 = __lsx_vsub_h(reg1, reg5); + reg5 = __lsx_vadd_h(reg7, reg3); + reg7 = __lsx_vsub_h(reg7, reg3); + reg3 = vec0; + + vec1 = reg2; + reg2 = __lsx_vadd_h(reg3, reg4); + reg3 = __lsx_vsub_h(reg3, reg4); + reg4 = __lsx_vsub_h(reg5, vec1); + reg5 = __lsx_vadd_h(reg5, vec1); + + tmp0 = __lsx_vneg_h(reg6); + DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7); + DOTP_CONST_PAIR(tmp0, reg1, cospi_24_64, cospi_8_64, reg6, reg1); + + vec0 = __lsx_vsub_h(reg0, reg6); + reg0 = __lsx_vadd_h(reg0, reg6); + vec1 = __lsx_vsub_h(reg7, reg1); + reg7 = __lsx_vadd_h(reg7, reg1); + + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1); + DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4); + + /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */ + /* Store 8 */ + LSX_BUTTERFLY_4_H(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0); + __lsx_vst(loc1, tmp_eve_buf, 0); + __lsx_vst(loc3, tmp_eve_buf, 16); + __lsx_vst(loc2, tmp_eve_buf, 224); + __lsx_vst(loc0, tmp_eve_buf, 240); + + LSX_BUTTERFLY_4_H(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0); + __lsx_vst(loc1, tmp_eve_buf, 32); + __lsx_vst(loc3, tmp_eve_buf, 48); + __lsx_vst(loc2, tmp_eve_buf, 192); + __lsx_vst(loc0, tmp_eve_buf, 208); + + /* Store 8 */ + LSX_BUTTERFLY_4_H(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0); + __lsx_vst(loc1, tmp_eve_buf, 64); + __lsx_vst(loc3, tmp_eve_buf, 80); + __lsx_vst(loc2, tmp_eve_buf, 160); + __lsx_vst(loc0, tmp_eve_buf, 176); + + LSX_BUTTERFLY_4_H(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0); + __lsx_vst(loc1, tmp_eve_buf, 96); + __lsx_vst(loc3, tmp_eve_buf, 112); + __lsx_vst(loc2, tmp_eve_buf, 128); + __lsx_vst(loc0, tmp_eve_buf, 144); +} + +static void idct8x32_column_odd_process_store(int16_t *tmp_buf, + int16_t *tmp_odd_buf) { + __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + + /* Odd stage 1 */ + DUP4_ARG2(__lsx_vld, tmp_buf, 64, tmp_buf, 448, tmp_buf, 576, tmp_buf, 960, + reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vld, tmp_buf, 1088, tmp_buf, 1472, tmp_buf, 1600, tmp_buf, + 1984, reg4, reg5, reg6, reg7); + + 
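+  /* DOTP_CONST_PAIR(a, b, c0, c1, o0, o1) is the shared rounding
+   * rotation (cf. the 32-bit DOTP_CONST_PAIR_W in fwd_txfm_lsx.h):
+   * o0 = (a * c0 - b * c1 + rnd) >> DCT_CONST_BITS and
+   * o1 = (a * c1 + b * c0 + rnd) >> DCT_CONST_BITS.
+   * Stage 1 below applies it to the symmetric odd-row pairs. */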
DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7); + DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4); + DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5); + DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6); + + vec0 = __lsx_vadd_h(reg0, reg3); + reg0 = __lsx_vsub_h(reg0, reg3); + reg3 = __lsx_vadd_h(reg7, reg4); + reg7 = __lsx_vsub_h(reg7, reg4); + reg4 = __lsx_vadd_h(reg1, reg2); + reg1 = __lsx_vsub_h(reg1, reg2); + reg2 = __lsx_vadd_h(reg6, reg5); + reg6 = __lsx_vsub_h(reg6, reg5); + reg5 = vec0; + + /* 4 Stores */ + DUP2_ARG2(__lsx_vadd_h, reg5, reg4, reg3, reg2, vec0, vec1); + __lsx_vst(vec0, tmp_odd_buf, 64); + __lsx_vst(vec1, tmp_odd_buf, 80); + DUP2_ARG2(__lsx_vsub_h, reg5, reg4, reg3, reg2, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1); + __lsx_vst(vec0, tmp_odd_buf, 0); + __lsx_vst(vec1, tmp_odd_buf, 16); + + /* 4 Stores */ + DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7); + DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6); + LSX_BUTTERFLY_4_H(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3); + DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3); + __lsx_vst(vec0, tmp_odd_buf, 96); + __lsx_vst(vec1, tmp_odd_buf, 112); + __lsx_vst(vec2, tmp_odd_buf, 32); + __lsx_vst(vec3, tmp_odd_buf, 48); + + /* Odd stage 2 */ + /* 8 loads */ + DUP4_ARG2(__lsx_vld, tmp_buf, 192, tmp_buf, 320, tmp_buf, 704, tmp_buf, 832, + reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vld, tmp_buf, 1216, tmp_buf, 1344, tmp_buf, 1728, tmp_buf, + 1856, reg4, reg5, reg6, reg7); + DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6); + DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5); + DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4); + DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7); + + /* 4 Stores */ + DUP4_ARG2(__lsx_vsub_h, reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0, + vec1, vec2, vec3); + DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1); + DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3); + LSX_BUTTERFLY_4_H(loc2, loc3, loc1, loc0, vec0, vec1, vec3, vec2); + __lsx_vst(vec0, tmp_odd_buf, 192); + __lsx_vst(vec1, tmp_odd_buf, 240); + DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1); + __lsx_vst(vec0, tmp_odd_buf, 160); + __lsx_vst(vec1, tmp_odd_buf, 176); + + /* 4 Stores */ + DUP4_ARG2(__lsx_vadd_h, reg0, reg3, reg1, reg2, reg5, reg6, reg4, reg7, vec0, + vec1, vec2, vec3); + LSX_BUTTERFLY_4_H(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2); + __lsx_vst(reg0, tmp_odd_buf, 208); + __lsx_vst(reg1, tmp_odd_buf, 224); + DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1); + __lsx_vst(reg0, tmp_odd_buf, 128); + __lsx_vst(reg1, tmp_odd_buf, 144); + + /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */ + /* Load 8 & Store 8 */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 0, tmp_odd_buf, 16, tmp_odd_buf, 32, + tmp_odd_buf, 48, reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 128, tmp_odd_buf, 144, tmp_odd_buf, 160, + tmp_odd_buf, 176, reg4, reg5, reg6, reg7); + DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, + loc1, loc2, loc3); + __lsx_vst(loc0, tmp_odd_buf, 0); + __lsx_vst(loc1, tmp_odd_buf, 16); + __lsx_vst(loc2, tmp_odd_buf, 32); + __lsx_vst(loc3, tmp_odd_buf, 48); + + DUP2_ARG2(__lsx_vsub_h, reg0, reg4, reg1, reg5, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); + 
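+  /* With both constants equal to cospi_16_64 the rotation reduces to a
+   * scaled difference/sum pair: locN = (vec1 -/+ vec0) * cospi_16_64,
+   * rounded by DCT_CONST_BITS. */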
DUP2_ARG2(__lsx_vsub_h, reg2, reg6, reg3, reg7, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + __lsx_vst(loc0, tmp_odd_buf, 128); + __lsx_vst(loc1, tmp_odd_buf, 144); + __lsx_vst(loc2, tmp_odd_buf, 160); + __lsx_vst(loc3, tmp_odd_buf, 176); + + /* Load 8 & Store 8 */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 64, tmp_odd_buf, 80, tmp_odd_buf, 96, + tmp_odd_buf, 112, reg1, reg2, reg0, reg3); + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 192, tmp_odd_buf, 208, tmp_odd_buf, 224, + tmp_odd_buf, 240, reg4, reg5, reg6, reg7); + DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, + loc1, loc2, loc3); + __lsx_vst(loc0, tmp_odd_buf, 64); + __lsx_vst(loc1, tmp_odd_buf, 80); + __lsx_vst(loc2, tmp_odd_buf, 96); + __lsx_vst(loc3, tmp_odd_buf, 112); + + DUP2_ARG2(__lsx_vsub_h, reg0, reg4, reg3, reg7, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); + DUP2_ARG2(__lsx_vsub_h, reg1, reg5, reg2, reg6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + __lsx_vst(loc0, tmp_odd_buf, 192); + __lsx_vst(loc1, tmp_odd_buf, 208); + __lsx_vst(loc2, tmp_odd_buf, 224); + __lsx_vst(loc3, tmp_odd_buf, 240); +} + +static void idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf, + int16_t *tmp_odd_buf, uint8_t *dst, + int32_t dst_stride) { + __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + __m128i m0, m1, m2, m3, m4, m5, m6, m7; + __m128i n0, n1, n2, n3, n4, n5, n6, n7; + int32_t stride = dst_stride << 2; + int32_t stride2 = stride << 1; + int32_t stride3 = stride + stride2; + + /* FINAL BUTTERFLY : Dependency on Even & Odd */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 0, tmp_odd_buf, 144, tmp_odd_buf, 224, + tmp_odd_buf, 96, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vld, tmp_eve_buf, 0, tmp_eve_buf, 128, tmp_eve_buf, 64, + tmp_eve_buf, 192, loc0, loc1, loc2, loc3); + + DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0, + m4, m2, m6); + DUP4_ARG2(__lsx_vsrari_h, m0, 6, m2, 6, m4, 6, m6, 6, m0, m2, m4, m6); + VP9_ADDBLK_ST8x4_UB(dst, stride, stride2, stride3, m0, m2, m4, m6); + DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m6, + m2, m4, m0); + DUP4_ARG2(__lsx_vsrari_h, m0, 6, m2, 6, m4, 6, m6, 6, m0, m2, m4, m6); + VP9_ADDBLK_ST8x4_UB((dst + 19 * dst_stride), stride, stride2, stride3, m0, m2, + m4, m6); + + /* Load 8 & Store 8 */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 64, tmp_odd_buf, 208, tmp_odd_buf, 160, + tmp_odd_buf, 48, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vld, tmp_eve_buf, 32, tmp_eve_buf, 160, tmp_eve_buf, 96, + tmp_eve_buf, 224, loc0, loc1, loc2, loc3); + + DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, + m5, m3, m7); + DUP4_ARG2(__lsx_vsrari_h, m1, 6, m3, 6, m5, 6, m7, 6, m1, m3, m5, m7); + VP9_ADDBLK_ST8x4_UB((dst + 2 * dst_stride), stride, stride2, stride3, m1, m3, + m5, m7); + DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m7, + m3, m5, m1); + DUP4_ARG2(__lsx_vsrari_h, m1, 6, m3, 6, m5, 6, m7, 6, m1, m3, m5, m7); + VP9_ADDBLK_ST8x4_UB((dst + 17 * dst_stride), stride, stride2, stride3, m1, m3, + m5, m7); + + /* Load 8 & Store 8 */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 32, tmp_odd_buf, 176, tmp_odd_buf, 192, + tmp_odd_buf, 112, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vld, tmp_eve_buf, 16, tmp_eve_buf, 144, tmp_eve_buf, 80, + tmp_eve_buf, 208, loc0, loc1, loc2, loc3); + DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, + n4, n2, n6); + DUP4_ARG2(__lsx_vsrari_h, n0, 
6, n2, 6, n4, 6, n6, 6, n0, n2, n4, n6); + VP9_ADDBLK_ST8x4_UB((dst + dst_stride), stride, stride2, stride3, n0, n2, n4, + n6); + DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n6, + n2, n4, n0); + DUP4_ARG2(__lsx_vsrari_h, n0, 6, n2, 6, n4, 6, n6, 6, n0, n2, n4, n6); + VP9_ADDBLK_ST8x4_UB((dst + 18 * dst_stride), stride, stride2, stride3, n0, n2, + n4, n6); + + /* Load 8 & Store 8 */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 80, tmp_odd_buf, 240, tmp_odd_buf, 128, + tmp_odd_buf, 16, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vld, tmp_eve_buf, 48, tmp_eve_buf, 176, tmp_eve_buf, 112, + tmp_eve_buf, 240, loc0, loc1, loc2, loc3); + DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, + n5, n3, n7); + DUP4_ARG2(__lsx_vsrari_h, n1, 6, n3, 6, n5, 6, n7, 6, n1, n3, n5, n7); + VP9_ADDBLK_ST8x4_UB((dst + 3 * dst_stride), stride, stride2, stride3, n1, n3, + n5, n7); + DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n7, + n3, n5, n1); + DUP4_ARG2(__lsx_vsrari_h, n1, 6, n3, 6, n5, 6, n7, 6, n1, n3, n5, n7); + VP9_ADDBLK_ST8x4_UB((dst + 16 * dst_stride), stride, stride2, stride3, n1, n3, + n5, n7); +} + +static void idct8x32_1d_columns_addblk_lsx(int16_t *input, uint8_t *dst, + int32_t dst_stride) { + DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]); + DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]); + + idct8x32_column_even_process_store(input, &tmp_eve_buf[0]); + idct8x32_column_odd_process_store(input, &tmp_odd_buf[0]); + idct8x32_column_butterfly_addblk(&tmp_eve_buf[0], &tmp_odd_buf[0], dst, + dst_stride); +} + +void vpx_idct32x32_1024_add_lsx(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + int32_t i; + DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]); + int16_t *out_ptr = out_arr; + + /* transform rows */ + for (i = 0; i < 4; ++i) { + /* process 32 * 8 block */ + idct32x8_1d_rows_lsx((input + (i << 8)), (out_ptr + (i << 8))); + } + + for (i = 0; i < 4; ++i) { + /* process 8 * 32 block */ + idct8x32_1d_columns_addblk_lsx((out_ptr + (i << 3)), (dst + (i << 3)), + dst_stride); + } +} + +void vpx_idct32x32_34_add_lsx(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + int32_t i; + DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]); + int16_t *out_ptr = out_arr; + __m128i zero = __lsx_vldi(0); + + for (i = 32; i--;) { + __lsx_vst(zero, out_ptr, 0); + __lsx_vst(zero, out_ptr, 16); + __lsx_vst(zero, out_ptr, 32); + __lsx_vst(zero, out_ptr, 48); + out_ptr += 32; + } + + out_ptr = out_arr; + + /* rows: only upper-left 8x8 has non-zero coeff */ + idct32x8_1d_rows_lsx(input, out_ptr); + + /* transform columns */ + for (i = 0; i < 4; ++i) { + /* process 8 * 32 block */ + idct8x32_1d_columns_addblk_lsx((out_ptr + (i << 3)), (dst + (i << 3)), + dst_stride); + } +} + +void vpx_idct32x32_1_add_lsx(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + int32_t i; + int16_t out; + __m128i dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3; + __m128i res0, res1, res2, res3, res4, res5, res6, res7, vec; + + out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS); + out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS); + out = ROUND_POWER_OF_TWO(out, 6); + + vec = __lsx_vreplgr2vr_h(out); + + for (i = 16; i--;) { + DUP2_ARG2(__lsx_vld, dst, 0, dst, 16, dst0, dst1); + dst2 = __lsx_vldx(dst, dst_stride); + dst3 = __lsx_vldx(dst + 16, dst_stride); + + UNPCK_UB_SH(dst0, res0, res4); + UNPCK_UB_SH(dst1, res1, res5); + UNPCK_UB_SH(dst2, res2, res6); + UNPCK_UB_SH(dst3, res3, res7); + + DUP4_ARG2(__lsx_vadd_h, res0, 
vec, res1, vec, res2, vec, res3, vec, res0, + res1, res2, res3); + DUP4_ARG2(__lsx_vadd_h, res4, vec, res5, vec, res6, vec, res7, vec, res4, + res5, res6, res7); + DUP4_ARG3(__lsx_vssrarni_bu_h, res4, res0, 0, res5, res1, 0, res6, res2, 0, + res7, res3, 0, tmp0, tmp1, tmp2, tmp3); + __lsx_vst(tmp0, dst, 0); + __lsx_vst(tmp1, dst, 16); + dst += dst_stride; + __lsx_vst(tmp2, dst, 0); + __lsx_vst(tmp3, dst, 16); + dst += dst_stride; + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/intrapred_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/intrapred_lsx.c new file mode 100644 index 0000000000..f990211791 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/intrapred_lsx.c @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2021 Loongson Technology Corporation Limited + * Contributed by Lu Wang <wanglu@loongson.cn> + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_util/loongson_intrinsics.h" + +static inline void intra_predict_dc_8x8_lsx(const uint8_t *src_top, + const uint8_t *src_left, + uint8_t *dst, int32_t dst_stride) { + uint64_t val0, val1; + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + __m128i store, sum_h, sum_w, sum_d; + __m128i src = { 0 }; + + val0 = *(const uint64_t *)src_top; + val1 = *(const uint64_t *)src_left; + DUP2_ARG3(__lsx_vinsgr2vr_d, src, val0, 0, src, val1, 1, src, src); + sum_h = __lsx_vhaddw_hu_bu(src, src); + sum_w = __lsx_vhaddw_wu_hu(sum_h, sum_h); + sum_d = __lsx_vhaddw_du_wu(sum_w, sum_w); + sum_w = __lsx_vpickev_w(sum_d, sum_d); + sum_d = __lsx_vhaddw_du_wu(sum_w, sum_w); + sum_w = __lsx_vsrari_w(sum_d, 4); + store = __lsx_vreplvei_b(sum_w, 0); + + __lsx_vstelm_d(store, dst, 0, 0); + __lsx_vstelm_d(store, dst + dst_stride, 0, 0); + __lsx_vstelm_d(store, dst + dst_stride_x2, 0, 0); + __lsx_vstelm_d(store, dst + dst_stride_x3, 0, 0); + dst += dst_stride_x4; + __lsx_vstelm_d(store, dst, 0, 0); + __lsx_vstelm_d(store, dst + dst_stride, 0, 0); + __lsx_vstelm_d(store, dst + dst_stride_x2, 0, 0); + __lsx_vstelm_d(store, dst + dst_stride_x3, 0, 0); +} + +static inline void intra_predict_dc_16x16_lsx(const uint8_t *src_top, + const uint8_t *src_left, + uint8_t *dst, + int32_t dst_stride) { + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + __m128i top, left, out; + __m128i sum_h, sum_top, sum_left; + __m128i sum_w; + __m128i sum_d; + + DUP2_ARG2(__lsx_vld, src_top, 0, src_left, 0, top, left); + DUP2_ARG2(__lsx_vhaddw_hu_bu, top, top, left, left, sum_top, sum_left); + sum_h = __lsx_vadd_h(sum_top, sum_left); + sum_w = __lsx_vhaddw_wu_hu(sum_h, sum_h); + sum_d = __lsx_vhaddw_du_wu(sum_w, sum_w); + sum_w = __lsx_vpickev_w(sum_d, sum_d); + sum_d = __lsx_vhaddw_du_wu(sum_w, sum_w); + sum_w = __lsx_vsrari_w(sum_d, 5); + out = __lsx_vreplvei_b(sum_w, 0); + + __lsx_vstx(out, dst, 0); + __lsx_vstx(out, dst, dst_stride); + __lsx_vstx(out, dst, dst_stride_x2); + __lsx_vstx(out, dst, dst_stride_x3); + dst += dst_stride_x4; + __lsx_vstx(out, dst, 0); + __lsx_vstx(out, dst, dst_stride); + __lsx_vstx(out, dst, dst_stride_x2); + __lsx_vstx(out, dst, dst_stride_x3); + dst += 
dst_stride_x4; + __lsx_vstx(out, dst, 0); + __lsx_vstx(out, dst, dst_stride); + __lsx_vstx(out, dst, dst_stride_x2); + __lsx_vstx(out, dst, dst_stride_x3); + dst += dst_stride_x4; + __lsx_vstx(out, dst, 0); + __lsx_vstx(out, dst, dst_stride); + __lsx_vstx(out, dst, dst_stride_x2); + __lsx_vstx(out, dst, dst_stride_x3); +} + +void vpx_dc_predictor_8x8_lsx(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_dc_8x8_lsx(above, left, dst, y_stride); +} + +void vpx_dc_predictor_16x16_lsx(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_dc_16x16_lsx(above, left, dst, y_stride); +} diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_16_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_16_lsx.c new file mode 100644 index 0000000000..0503df9966 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_16_lsx.c @@ -0,0 +1,1320 @@ +/* + * Copyright (c) 2022 Loongson Technology Corporation Limited + * Contributed by Hecai Yuan <yuanhecai@loongson.cn> + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/loopfilter_lsx.h" +#include "vpx_ports/mem.h" + +#define LSX_LD_8(_src, _stride, _stride2, _stride3, _stride4, _in0, _in1, \ + _in2, _in3, _in4, _in5, _in6, _in7) \ + do { \ + _in0 = __lsx_vld(_src, 0); \ + _in1 = __lsx_vldx(_src, _stride); \ + _in2 = __lsx_vldx(_src, _stride2); \ + _in3 = __lsx_vldx(_src, _stride3); \ + _src += _stride4; \ + _in4 = __lsx_vld(_src, 0); \ + _in5 = __lsx_vldx(_src, _stride); \ + _in6 = __lsx_vldx(_src, _stride2); \ + _in7 = __lsx_vldx(_src, _stride3); \ + } while (0) + +#define LSX_ST_8(_dst0, _dst1, _dst2, _dst3, _dst4, _dst5, _dst6, _dst7, _dst, \ + _stride, _stride2, _stride3, _stride4) \ + do { \ + __lsx_vst(_dst0, _dst, 0); \ + __lsx_vstx(_dst1, _dst, _stride); \ + __lsx_vstx(_dst2, _dst, _stride2); \ + __lsx_vstx(_dst3, _dst, _stride3); \ + _dst += _stride4; \ + __lsx_vst(_dst4, _dst, 0); \ + __lsx_vstx(_dst5, _dst, _stride); \ + __lsx_vstx(_dst6, _dst, _stride2); \ + __lsx_vstx(_dst7, _dst, _stride3); \ + } while (0) + +static int32_t hz_lpf_t4_and_t8_16w(uint8_t *dst, int32_t stride, + uint8_t *filter48, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + __m128i flat, mask, hev, thresh, b_limit, limit; + __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; + __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h; + __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l; + __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l; + __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h; + __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h; + + int32_t stride2 = stride << 1; + int32_t stride3 = stride2 + stride; + int32_t stride4 = stride2 << 1; + + /* load vector elements */ + DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2, dst, + -stride, p3, p2, p1, p0); + + q0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2); + q3 = __lsx_vldx(dst, stride3); + + thresh = __lsx_vldrepl_b(thresh_ptr, 0); + b_limit = __lsx_vldrepl_b(b_limit_ptr, 0); + limit = 
__lsx_vldrepl_b(limit_ptr, 0); + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + if (__lsx_bz_v(flat)) { + __lsx_vstx(p1_out, dst, -stride2); + __lsx_vstx(p0_out, dst, -stride); + __lsx_vst(q0_out, dst, 0); + __lsx_vstx(q1_out, dst, stride); + + return 1; + } + + DUP4_ARG2(__lsx_vsllwil_hu_bu, p3, 0, p2, 0, p1, 0, p0, 0, p3_l, p2_l, p1_l, + p0_l); + DUP4_ARG2(__lsx_vsllwil_hu_bu, q0, 0, q1, 0, q2, 0, q3, 0, q0_l, q1_l, q2_l, + q3_l); + + VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); + + DUP4_ARG1(__lsx_vexth_hu_bu, p3, p2, p1, p0, p3_h, p2_h, p1_h, p0_h); + DUP4_ARG1(__lsx_vexth_hu_bu, q0, q1, q2, q3, q0_h, q1_h, q2_h, q3_h); + VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h, + p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h); + + /* convert 16 bit output data into 8 bit */ + DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h, p1_filt8_l, + p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l); + DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h, q2_filt8_l, + q1_filt8_l, q2_filt8_l); + + /* store pixel values */ + DUP4_ARG3(__lsx_vbitsel_v, p2, p2_filt8_l, flat, p1_out, p1_filt8_l, flat, + p0_out, p0_filt8_l, flat, q0_out, q0_filt8_l, flat, p2_out, p1_out, + p0_out, q0_out); + DUP2_ARG3(__lsx_vbitsel_v, q1_out, q1_filt8_l, flat, q2, q2_filt8_l, flat, + q1_out, q2_out); + + __lsx_vst(p2_out, filter48, 0); + __lsx_vst(p1_out, filter48, 16); + __lsx_vst(p0_out, filter48, 32); + __lsx_vst(q0_out, filter48, 48); + __lsx_vst(q1_out, filter48, 64); + __lsx_vst(q2_out, filter48, 80); + __lsx_vst(flat, filter48, 96); + + return 0; +} + +static void hz_lpf_t16_16w(uint8_t *dst, int32_t stride, uint8_t *filter48) { + int32_t stride2 = stride << 1; + int32_t stride3 = stride2 + stride; + int32_t stride4 = stride2 << 1; + uint8_t *dst_tmp0 = dst - stride4; + uint8_t *dst_tmp1 = dst + stride4; + + __m128i flat, flat2, filter8; + __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + __m128i out_h, out_l; + v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in; + v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in; + v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in; + v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in; + v8u16 p7_h_in, p6_h_in, p5_h_in, p4_h_in; + v8u16 p3_h_in, p2_h_in, p1_h_in, p0_h_in; + v8u16 q7_h_in, q6_h_in, q5_h_in, q4_h_in; + v8u16 q3_h_in, q2_h_in, q1_h_in, q0_h_in; + v8u16 tmp0_l, tmp1_l, tmp0_h, tmp1_h; + + flat = __lsx_vld(filter48, 96); + + DUP4_ARG2(__lsx_vldx, dst_tmp0, -stride4, dst_tmp0, -stride3, dst_tmp0, + -stride2, dst_tmp0, -stride, p7, p6, p5, p4); + + p3 = __lsx_vld(dst_tmp0, 0); + DUP2_ARG2(__lsx_vldx, dst_tmp0, stride, dst_tmp0, stride2, p2, p1); + p0 = __lsx_vldx(dst_tmp0, stride3); + + q0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2); + q3 = __lsx_vldx(dst, stride3); + + q4 = __lsx_vld(dst_tmp1, 0); + DUP2_ARG2(__lsx_vldx, dst_tmp1, stride, dst_tmp1, stride2, q5, q6); + q7 = __lsx_vldx(dst_tmp1, stride3); + + VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); + + if (__lsx_bz_v(flat2)) { + DUP4_ARG2(__lsx_vld, filter48, 0, filter48, 16, filter48, 32, filter48, 48, + p2, p1, p0, q0); + DUP2_ARG2(__lsx_vld, filter48, 64, filter48, 80, q1, q2); + __lsx_vstx(p2, dst, 
-stride3); + __lsx_vstx(p1, dst, -stride2); + __lsx_vstx(p0, dst, -stride); + __lsx_vst(q0, dst, 0); + __lsx_vstx(q1, dst, stride); + __lsx_vstx(q2, dst, stride2); + } else { + dst = dst_tmp0 - stride3; + + p7_l_in = (v8u16)__lsx_vsllwil_hu_bu(p7, 0); + p6_l_in = (v8u16)__lsx_vsllwil_hu_bu(p6, 0); + p5_l_in = (v8u16)__lsx_vsllwil_hu_bu(p5, 0); + p4_l_in = (v8u16)__lsx_vsllwil_hu_bu(p4, 0); + p3_l_in = (v8u16)__lsx_vsllwil_hu_bu(p3, 0); + p2_l_in = (v8u16)__lsx_vsllwil_hu_bu(p2, 0); + p1_l_in = (v8u16)__lsx_vsllwil_hu_bu(p1, 0); + p0_l_in = (v8u16)__lsx_vsllwil_hu_bu(p0, 0); + q0_l_in = (v8u16)__lsx_vsllwil_hu_bu(q0, 0); + + tmp0_l = p7_l_in << 3; + tmp0_l -= p7_l_in; + tmp0_l += p6_l_in; + tmp0_l += q0_l_in; + tmp1_l = p6_l_in + p5_l_in; + tmp1_l += p4_l_in; + tmp1_l += p3_l_in; + tmp1_l += p2_l_in; + tmp1_l += p1_l_in; + tmp1_l += p0_l_in; + tmp1_l += tmp0_l; + + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + p7_h_in = (v8u16)__lsx_vexth_hu_bu(p7); + p6_h_in = (v8u16)__lsx_vexth_hu_bu(p6); + p5_h_in = (v8u16)__lsx_vexth_hu_bu(p5); + p4_h_in = (v8u16)__lsx_vexth_hu_bu(p4); + p3_h_in = (v8u16)__lsx_vexth_hu_bu(p3); + p2_h_in = (v8u16)__lsx_vexth_hu_bu(p2); + p1_h_in = (v8u16)__lsx_vexth_hu_bu(p1); + p0_h_in = (v8u16)__lsx_vexth_hu_bu(p0); + q0_h_in = (v8u16)__lsx_vexth_hu_bu(q0); + + tmp0_h = p7_h_in << 3; + tmp0_h -= p7_h_in; + tmp0_h += p6_h_in; + tmp0_h += q0_h_in; + tmp1_h = p6_h_in + p5_h_in; + tmp1_h += p4_h_in; + tmp1_h += p3_h_in; + tmp1_h += p2_h_in; + tmp1_h += p1_h_in; + tmp1_h += p0_h_in; + tmp1_h += tmp0_h; + + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + p6 = __lsx_vbitsel_v(p6, out_l, flat2); + __lsx_vst(p6, dst, 0); + dst += stride; + + /* p5 */ + q1_l_in = (v8u16)__lsx_vsllwil_hu_bu(q1, 0); + tmp0_l = p5_l_in - p6_l_in; + tmp0_l += q1_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + q1_h_in = (v8u16)__lsx_vexth_hu_bu(q1); + tmp0_h = p5_h_in - p6_h_in; + tmp0_h += q1_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + p5 = __lsx_vbitsel_v(p5, out_l, flat2); + __lsx_vst(p5, dst, 0); + dst += stride; + + /* p4 */ + q2_l_in = (v8u16)__lsx_vsllwil_hu_bu(q2, 0); + tmp0_l = p4_l_in - p5_l_in; + tmp0_l += q2_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + q2_h_in = (v8u16)__lsx_vexth_hu_bu(q2); + tmp0_h = p4_h_in - p5_h_in; + tmp0_h += q2_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + p4 = __lsx_vbitsel_v(p4, out_l, flat2); + __lsx_vst(p4, dst, 0); + dst += stride; + + /* p3 */ + q3_l_in = (v8u16)__lsx_vsllwil_hu_bu(q3, 0); + tmp0_l = p3_l_in - p4_l_in; + tmp0_l += q3_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + q3_h_in = (v8u16)__lsx_vexth_hu_bu(q3); + tmp0_h = p3_h_in - p4_h_in; + tmp0_h += q3_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + p3 = __lsx_vbitsel_v(p3, out_l, flat2); + __lsx_vst(p3, dst, 0); + dst += stride; + + /* p2 */ + q4_l_in = (v8u16)__lsx_vsllwil_hu_bu(q4, 0); + filter8 = __lsx_vld(filter48, 0); + tmp0_l = p2_l_in - p3_l_in; + tmp0_l += q4_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + q4_h_in = (v8u16)__lsx_vexth_hu_bu(q4); + tmp0_h = p2_h_in - p3_h_in; + 
tmp0_h += q4_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 0); + dst += stride; + + /* p1 */ + q5_l_in = (v8u16)__lsx_vsllwil_hu_bu(q5, 0); + filter8 = __lsx_vld(filter48, 16); + tmp0_l = p1_l_in - p2_l_in; + tmp0_l += q5_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + q5_h_in = (v8u16)__lsx_vexth_hu_bu(q5); + tmp0_h = p1_h_in - p2_h_in; + tmp0_h += q5_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 0); + dst += stride; + + /* p0 */ + q6_l_in = (v8u16)__lsx_vsllwil_hu_bu(q6, 0); + filter8 = __lsx_vld(filter48, 32); + tmp0_l = p0_l_in - p1_l_in; + tmp0_l += q6_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + q6_h_in = (v8u16)__lsx_vexth_hu_bu(q6); + tmp0_h = p0_h_in - p1_h_in; + tmp0_h += q6_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 0); + dst += stride; + + /* q0 */ + q7_l_in = (v8u16)__lsx_vsllwil_hu_bu(q7, 0); + filter8 = __lsx_vld(filter48, 48); + tmp0_l = q7_l_in - p0_l_in; + tmp0_l += q0_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + q7_h_in = (v8u16)__lsx_vexth_hu_bu(q7); + tmp0_h = q7_h_in - p0_h_in; + tmp0_h += q0_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 0); + dst += stride; + + /* q1 */ + filter8 = __lsx_vld(filter48, 64); + tmp0_l = q7_l_in - q0_l_in; + tmp0_l += q1_l_in; + tmp0_l -= p6_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + tmp0_h = q7_h_in - q0_h_in; + tmp0_h += q1_h_in; + tmp0_h -= p6_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 0); + dst += stride; + + /* q2 */ + filter8 = __lsx_vld(filter48, 80); + tmp0_l = q7_l_in - q1_l_in; + tmp0_l += q2_l_in; + tmp0_l -= p5_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + tmp0_h = q7_h_in - q1_h_in; + tmp0_h += q2_h_in; + tmp0_h -= p5_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 0); + dst += stride; + + /* q3 */ + tmp0_l = q7_l_in - q2_l_in; + tmp0_l += q3_l_in; + tmp0_l -= p4_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + tmp0_h = q7_h_in - q2_h_in; + tmp0_h += q3_h_in; + tmp0_h -= p4_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + q3 = __lsx_vbitsel_v(q3, out_l, flat2); + __lsx_vst(q3, dst, 0); + dst += stride; + + /* q4 */ + tmp0_l = q7_l_in - q3_l_in; + tmp0_l += q4_l_in; + tmp0_l -= p3_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + tmp0_h = q7_h_in - q3_h_in; + tmp0_h += q4_h_in; + tmp0_h -= p3_h_in; + tmp1_h += tmp0_h; + out_h = 
__lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + q4 = __lsx_vbitsel_v(q4, out_l, flat2); + __lsx_vst(q4, dst, 0); + dst += stride; + + /* q5 */ + tmp0_l = q7_l_in - q4_l_in; + tmp0_l += q5_l_in; + tmp0_l -= p2_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + tmp0_h = q7_h_in - q4_h_in; + tmp0_h += q5_h_in; + tmp0_h -= p2_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + q5 = __lsx_vbitsel_v(q5, out_l, flat2); + __lsx_vst(q5, dst, 0); + dst += stride; + + /* q6 */ + tmp0_l = q7_l_in - q5_l_in; + tmp0_l += q6_l_in; + tmp0_l -= p1_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + tmp0_h = q7_h_in - q5_h_in; + tmp0_h += q6_h_in; + tmp0_h -= p1_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + q6 = __lsx_vbitsel_v(q6, out_l, flat2); + __lsx_vst(q6, dst, 0); + } +} + +static void mb_lpf_horizontal_edge_dual(uint8_t *dst, int32_t stride, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + DECLARE_ALIGNED(16, uint8_t, filter48[16 * 8]); + uint8_t early_exit = 0; + + early_exit = hz_lpf_t4_and_t8_16w(dst, stride, &filter48[0], b_limit_ptr, + limit_ptr, thresh_ptr); + + if (early_exit == 0) { + hz_lpf_t16_16w(dst, stride, filter48); + } +} + +static void mb_lpf_horizontal_edge(uint8_t *dst, int32_t stride, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr, int32_t count) { + if (count == 1) { + __m128i flat2, mask, hev, flat, thresh, b_limit, limit; + __m128i p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7; + __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + __m128i p0_filter16, p1_filter16; + __m128i p2_filter8, p1_filter8, p0_filter8; + __m128i q0_filter8, q1_filter8, q2_filter8; + __m128i p7_l, p6_l, p5_l, p4_l, q7_l, q6_l, q5_l, q4_l; + __m128i p3_l, p2_l, p1_l, p0_l, q3_l, q2_l, q1_l, q0_l; + __m128i zero = __lsx_vldi(0); + __m128i tmp0, tmp1, tmp2; + + int32_t stride2 = stride << 1; + int32_t stride3 = stride2 + stride; + int32_t stride4 = stride << 2; + uint8_t *dst_tmp0 = dst - stride4; + uint8_t *dst_tmp1 = dst + stride4; + + /* load vector elements */ + DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2, dst, + -stride, p3, p2, p1, p0); + q0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2); + q3 = __lsx_vldx(dst, stride3); + + thresh = __lsx_vldrepl_b(thresh_ptr, 0); + b_limit = __lsx_vldrepl_b(b_limit_ptr, 0); + limit = __lsx_vldrepl_b(limit_ptr, 0); + + /* filter_mask* */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, + q1_out); + flat = __lsx_vilvl_d(zero, flat); + if (__lsx_bz_v(flat)) { + __lsx_vstelm_d(p1_out, dst - stride2, 0, 0); + __lsx_vstelm_d(p0_out, dst - stride, 0, 0); + __lsx_vstelm_d(q0_out, dst, 0, 0); + __lsx_vstelm_d(q1_out, dst + stride, 0, 0); + } else { + /* convert 8 bit input data into 16 bit */ + DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_l, + p2_l, p1_l, p0_l); + DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_l, + q1_l, q2_l, q3_l); + VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filter8, + p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8); + + /* convert 16 bit output data into 8 bit 
*/ + DUP4_ARG2(__lsx_vpickev_b, zero, p2_filter8, zero, p1_filter8, zero, + p0_filter8, zero, q0_filter8, p2_filter8, p1_filter8, + p0_filter8, q0_filter8); + DUP2_ARG2(__lsx_vpickev_b, zero, q1_filter8, zero, q2_filter8, q1_filter8, + q2_filter8); + + /* store pixel values */ + p2_out = __lsx_vbitsel_v(p2, p2_filter8, flat); + p1_out = __lsx_vbitsel_v(p1_out, p1_filter8, flat); + p0_out = __lsx_vbitsel_v(p0_out, p0_filter8, flat); + q0_out = __lsx_vbitsel_v(q0_out, q0_filter8, flat); + q1_out = __lsx_vbitsel_v(q1_out, q1_filter8, flat); + q2_out = __lsx_vbitsel_v(q2, q2_filter8, flat); + + /* load 16 vector elements */ + DUP4_ARG2(__lsx_vldx, dst_tmp0, -stride4, dst_tmp0, -stride3, dst_tmp0, + -stride2, dst_tmp0, -stride, p7, p6, p5, p4); + q4 = __lsx_vld(dst_tmp1, 0); + DUP2_ARG2(__lsx_vldx, dst_tmp1, stride, dst_tmp1, stride2, q5, q6); + q7 = __lsx_vldx(dst_tmp1, stride3); + + VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); + + if (__lsx_bz_v(flat2)) { + dst -= stride3; + __lsx_vstelm_d(p2_out, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(p1_out, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(p0_out, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(q0_out, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(q1_out, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(q2_out, dst, 0, 0); + } else { + /* LSB(right) 8 pixel operation */ + DUP4_ARG2(__lsx_vilvl_b, zero, p7, zero, p6, zero, p5, zero, p4, p7_l, + p6_l, p5_l, p4_l); + DUP4_ARG2(__lsx_vilvl_b, zero, q4, zero, q5, zero, q6, zero, q7, q4_l, + q5_l, q6_l, q7_l); + + tmp0 = __lsx_vslli_h(p7_l, 3); + tmp0 = __lsx_vsub_h(tmp0, p7_l); + tmp0 = __lsx_vadd_h(tmp0, p6_l); + tmp0 = __lsx_vadd_h(tmp0, q0_l); + + dst = dst_tmp0 - stride3; + + /* calculation of p6 and p5 */ + tmp1 = __lsx_vadd_h(p6_l, p5_l); + tmp1 = __lsx_vadd_h(tmp1, p4_l); + tmp1 = __lsx_vadd_h(tmp1, p3_l); + tmp1 = __lsx_vadd_h(tmp1, p2_l); + tmp1 = __lsx_vadd_h(tmp1, p1_l); + tmp1 = __lsx_vadd_h(tmp1, p0_l); + tmp1 = __lsx_vadd_h(tmp1, tmp0); + p0_filter16 = __lsx_vsrari_h(tmp1, 4); + tmp0 = __lsx_vsub_h(p5_l, p6_l); + tmp0 = __lsx_vadd_h(tmp0, q1_l); + tmp0 = __lsx_vsub_h(tmp0, p7_l); + tmp1 = __lsx_vadd_h(tmp1, tmp0); + p1_filter16 = __lsx_vsrari_h(tmp1, 4); + DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16, + p0_filter16, p1_filter16); + DUP2_ARG3(__lsx_vbitsel_v, p6, p0_filter16, flat2, p5, p1_filter16, + flat2, p0_filter16, p1_filter16); + __lsx_vstelm_d(p0_filter16, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(p1_filter16, dst, 0, 0); + dst += stride; + + /* calculation of p4 and p3 */ + tmp0 = __lsx_vsub_h(p4_l, p5_l); + tmp0 = __lsx_vadd_h(tmp0, q2_l); + tmp0 = __lsx_vsub_h(tmp0, p7_l); + tmp2 = __lsx_vsub_h(p3_l, p4_l); + tmp2 = __lsx_vadd_h(tmp2, q3_l); + tmp2 = __lsx_vsub_h(tmp2, p7_l); + tmp1 = __lsx_vadd_h(tmp1, tmp0); + p0_filter16 = __lsx_vsrari_h(tmp1, 4); + tmp1 = __lsx_vadd_h(tmp1, tmp2); + p1_filter16 = __lsx_vsrari_h(tmp1, 4); + DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16, + p0_filter16, p1_filter16); + DUP2_ARG3(__lsx_vbitsel_v, p4, p0_filter16, flat2, p3, p1_filter16, + flat2, p0_filter16, p1_filter16); + __lsx_vstelm_d(p0_filter16, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(p1_filter16, dst, 0, 0); + dst += stride; + + /* calculation of p2 and p1 */ + tmp0 = __lsx_vsub_h(p2_l, p3_l); + tmp0 = __lsx_vadd_h(tmp0, q4_l); + tmp0 = __lsx_vsub_h(tmp0, p7_l); + tmp2 = __lsx_vsub_h(p1_l, p2_l); + tmp2 = __lsx_vadd_h(tmp2, q5_l); + tmp2 = __lsx_vsub_h(tmp2, p7_l); + tmp1 = __lsx_vadd_h(tmp1, tmp0); + p0_filter16 = 
__lsx_vsrari_h(tmp1, 4); + tmp1 = __lsx_vadd_h(tmp1, tmp2); + p1_filter16 = __lsx_vsrari_h(tmp1, 4); + DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16, + p0_filter16, p1_filter16); + DUP2_ARG3(__lsx_vbitsel_v, p2_out, p0_filter16, flat2, p1_out, + p1_filter16, flat2, p0_filter16, p1_filter16); + __lsx_vstelm_d(p0_filter16, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(p1_filter16, dst, 0, 0); + dst += stride; + + /* calculation of p0 and q0 */ + tmp0 = __lsx_vsub_h(p0_l, p1_l); + tmp0 = __lsx_vadd_h(tmp0, q6_l); + tmp0 = __lsx_vsub_h(tmp0, p7_l); + tmp2 = __lsx_vsub_h(q7_l, p0_l); + tmp2 = __lsx_vadd_h(tmp2, q0_l); + tmp2 = __lsx_vsub_h(tmp2, p7_l); + tmp1 = __lsx_vadd_h(tmp1, tmp0); + p0_filter16 = __lsx_vsrari_h(tmp1, 4); + tmp1 = __lsx_vadd_h(tmp1, tmp2); + p1_filter16 = __lsx_vsrari_h(tmp1, 4); + DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16, + p0_filter16, p1_filter16); + DUP2_ARG3(__lsx_vbitsel_v, p0_out, p0_filter16, flat2, q0_out, + p1_filter16, flat2, p0_filter16, p1_filter16); + __lsx_vstelm_d(p0_filter16, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(p1_filter16, dst, 0, 0); + dst += stride; + + /* calculation of q1 and q2 */ + tmp0 = __lsx_vsub_h(q7_l, q0_l); + tmp0 = __lsx_vadd_h(tmp0, q1_l); + tmp0 = __lsx_vsub_h(tmp0, p6_l); + tmp2 = __lsx_vsub_h(q7_l, q1_l); + tmp2 = __lsx_vadd_h(tmp2, q2_l); + tmp2 = __lsx_vsub_h(tmp2, p5_l); + tmp1 = __lsx_vadd_h(tmp1, tmp0); + p0_filter16 = __lsx_vsrari_h(tmp1, 4); + tmp1 = __lsx_vadd_h(tmp1, tmp2); + p1_filter16 = __lsx_vsrari_h(tmp1, 4); + DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16, + p0_filter16, p1_filter16); + DUP2_ARG3(__lsx_vbitsel_v, q1_out, p0_filter16, flat2, q2_out, + p1_filter16, flat2, p0_filter16, p1_filter16); + __lsx_vstelm_d(p0_filter16, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(p1_filter16, dst, 0, 0); + dst += stride; + + /* calculation of q3 and q4 */ + tmp0 = __lsx_vsub_h(q7_l, q2_l); + tmp0 = __lsx_vadd_h(tmp0, q3_l); + tmp0 = __lsx_vsub_h(tmp0, p4_l); + tmp2 = __lsx_vsub_h(q7_l, q3_l); + tmp2 = __lsx_vadd_h(tmp2, q4_l); + tmp2 = __lsx_vsub_h(tmp2, p3_l); + tmp1 = __lsx_vadd_h(tmp1, tmp0); + p0_filter16 = __lsx_vsrari_h(tmp1, 4); + tmp1 = __lsx_vadd_h(tmp1, tmp2); + p1_filter16 = __lsx_vsrari_h(tmp1, 4); + DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16, + p0_filter16, p1_filter16); + DUP2_ARG3(__lsx_vbitsel_v, q3, p0_filter16, flat2, q4, p1_filter16, + flat2, p0_filter16, p1_filter16); + __lsx_vstelm_d(p0_filter16, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(p1_filter16, dst, 0, 0); + dst += stride; + + /* calculation of q5 and q6 */ + tmp0 = __lsx_vsub_h(q7_l, q4_l); + tmp0 = __lsx_vadd_h(tmp0, q5_l); + tmp0 = __lsx_vsub_h(tmp0, p2_l); + tmp2 = __lsx_vsub_h(q7_l, q5_l); + tmp2 = __lsx_vadd_h(tmp2, q6_l); + tmp2 = __lsx_vsub_h(tmp2, p1_l); + tmp1 = __lsx_vadd_h(tmp1, tmp0); + p0_filter16 = __lsx_vsrari_h(tmp1, 4); + tmp1 = __lsx_vadd_h(tmp1, tmp2); + p1_filter16 = __lsx_vsrari_h(tmp1, 4); + DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16, + p0_filter16, p1_filter16); + DUP2_ARG3(__lsx_vbitsel_v, q5, p0_filter16, flat2, q6, p1_filter16, + flat2, p0_filter16, p1_filter16); + __lsx_vstelm_d(p0_filter16, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(p1_filter16, dst, 0, 0); + } + } + } else { + mb_lpf_horizontal_edge_dual(dst, stride, b_limit_ptr, limit_ptr, + thresh_ptr); + } +} + +void vpx_lpf_horizontal_16_dual_lsx(uint8_t *dst, int32_t stride, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t 
*thresh_ptr) { + mb_lpf_horizontal_edge(dst, stride, b_limit_ptr, limit_ptr, thresh_ptr, 2); +} + +static void transpose_16x16(uint8_t *input, int32_t in_stride, uint8_t *output, + int32_t out_stride) { + __m128i row0, row1, row2, row3, row4, row5, row6, row7; + __m128i row8, row9, row10, row11, row12, row13, row14, row15; + __m128i tmp0, tmp1, tmp4, tmp5, tmp6, tmp7; + __m128i tmp2, tmp3; + __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + int32_t in_stride2 = in_stride << 1; + int32_t in_stride3 = in_stride2 + in_stride; + int32_t in_stride4 = in_stride2 << 1; + int32_t out_stride2 = out_stride << 1; + int32_t out_stride3 = out_stride2 + out_stride; + int32_t out_stride4 = out_stride2 << 1; + + LSX_LD_8(input, in_stride, in_stride2, in_stride3, in_stride4, row0, row1, + row2, row3, row4, row5, row6, row7); + input += in_stride4; + LSX_LD_8(input, in_stride, in_stride2, in_stride3, in_stride4, row8, row9, + row10, row11, row12, row13, row14, row15); + + LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8, + row9, row10, row11, row12, row13, row14, row15, p7, p6, + p5, p4, p3, p2, p1, p0); + + /* transpose 16x8 matrix into 8x16 */ + /* total 8 intermediate registers and 32 instructions */ + q7 = __lsx_vpackod_d(row8, row0); + q6 = __lsx_vpackod_d(row9, row1); + q5 = __lsx_vpackod_d(row10, row2); + q4 = __lsx_vpackod_d(row11, row3); + q3 = __lsx_vpackod_d(row12, row4); + q2 = __lsx_vpackod_d(row13, row5); + q1 = __lsx_vpackod_d(row14, row6); + q0 = __lsx_vpackod_d(row15, row7); + + DUP2_ARG2(__lsx_vpackev_b, q6, q7, q4, q5, tmp0, tmp1); + DUP2_ARG2(__lsx_vpackod_b, q6, q7, q4, q5, tmp4, tmp5); + + DUP2_ARG2(__lsx_vpackev_b, q2, q3, q0, q1, q5, q7); + DUP2_ARG2(__lsx_vpackod_b, q2, q3, q0, q1, tmp6, tmp7); + + DUP2_ARG2(__lsx_vpackev_h, tmp1, tmp0, q7, q5, tmp2, tmp3); + q0 = __lsx_vpackev_w(tmp3, tmp2); + q4 = __lsx_vpackod_w(tmp3, tmp2); + + tmp2 = __lsx_vpackod_h(tmp1, tmp0); + tmp3 = __lsx_vpackod_h(q7, q5); + q2 = __lsx_vpackev_w(tmp3, tmp2); + q6 = __lsx_vpackod_w(tmp3, tmp2); + + DUP2_ARG2(__lsx_vpackev_h, tmp5, tmp4, tmp7, tmp6, tmp2, tmp3); + q1 = __lsx_vpackev_w(tmp3, tmp2); + q5 = __lsx_vpackod_w(tmp3, tmp2); + + tmp2 = __lsx_vpackod_h(tmp5, tmp4); + tmp3 = __lsx_vpackod_h(tmp7, tmp6); + q3 = __lsx_vpackev_w(tmp3, tmp2); + q7 = __lsx_vpackod_w(tmp3, tmp2); + + LSX_ST_8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_stride, out_stride2, + out_stride3, out_stride4); + output += out_stride4; + LSX_ST_8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_stride, out_stride2, + out_stride3, out_stride4); +} + +static int32_t vt_lpf_t4_and_t8_16w(uint8_t *dst, uint8_t *filter48, + uint8_t *dst_org, int32_t stride, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + int32_t stride2 = stride << 1; + int32_t stride3 = stride2 + stride; + int32_t stride4 = stride2 << 1; + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + __m128i flat, mask, hev, thresh, b_limit, limit; + __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; + __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h; + __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l; + __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l; + __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h; + __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h; + __m128i vec0, vec1, vec2, vec3, vec4, vec5; + + /* load vector elements */ + DUP4_ARG2(__lsx_vld, dst, -64, dst, -48, dst, -32, dst, -16, p3, p2, p1, p0); + DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 
48, q0, q1, q2, q3); + + thresh = __lsx_vldrepl_b(thresh_ptr, 0); + b_limit = __lsx_vldrepl_b(b_limit_ptr, 0); + limit = __lsx_vldrepl_b(limit_ptr, 0); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + /* flat4 */ + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + /* filter4 */ + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + /* if flat is zero for all pixels, then no need to calculate other filter */ + if (__lsx_bz_v(flat)) { + DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1); + vec2 = __lsx_vilvl_h(vec1, vec0); + vec3 = __lsx_vilvh_h(vec1, vec0); + DUP2_ARG2(__lsx_vilvh_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1); + vec4 = __lsx_vilvl_h(vec1, vec0); + vec5 = __lsx_vilvh_h(vec1, vec0); + + dst_org -= 2; + __lsx_vstelm_w(vec2, dst_org, 0, 0); + __lsx_vstelm_w(vec2, dst_org + stride, 0, 1); + __lsx_vstelm_w(vec2, dst_org + stride2, 0, 2); + __lsx_vstelm_w(vec2, dst_org + stride3, 0, 3); + dst_org += stride4; + __lsx_vstelm_w(vec3, dst_org, 0, 0); + __lsx_vstelm_w(vec3, dst_org + stride, 0, 1); + __lsx_vstelm_w(vec3, dst_org + stride2, 0, 2); + __lsx_vstelm_w(vec3, dst_org + stride3, 0, 3); + dst_org += stride4; + __lsx_vstelm_w(vec4, dst_org, 0, 0); + __lsx_vstelm_w(vec4, dst_org + stride, 0, 1); + __lsx_vstelm_w(vec4, dst_org + stride2, 0, 2); + __lsx_vstelm_w(vec4, dst_org + stride3, 0, 3); + dst_org += stride4; + __lsx_vstelm_w(vec5, dst_org, 0, 0); + __lsx_vstelm_w(vec5, dst_org + stride, 0, 1); + __lsx_vstelm_w(vec5, dst_org + stride2, 0, 2); + __lsx_vstelm_w(vec5, dst_org + stride3, 0, 3); + + return 1; + } + + DUP4_ARG2(__lsx_vsllwil_hu_bu, p3, 0, p2, 0, p1, 0, p0, 0, p3_l, p2_l, p1_l, + p0_l); + DUP4_ARG2(__lsx_vsllwil_hu_bu, q0, 0, q1, 0, q2, 0, q3, 0, q0_l, q1_l, q2_l, + q3_l); + VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); + DUP4_ARG1(__lsx_vexth_hu_bu, p3, p2, p1, p0, p3_h, p2_h, p1_h, p0_h); + DUP4_ARG1(__lsx_vexth_hu_bu, q0, q1, q2, q3, q0_h, q1_h, q2_h, q3_h); + VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h, + p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h); + + /* convert 16 bit output data into 8 bit */ + DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h, p1_filt8_l, + p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l); + DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h, q2_filt8_l, + q1_filt8_l, q2_filt8_l); + + /* store pixel values */ + p2_out = __lsx_vbitsel_v(p2, p2_filt8_l, flat); + p1_out = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat); + p0_out = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat); + q0_out = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat); + q1_out = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat); + q2_out = __lsx_vbitsel_v(q2, q2_filt8_l, flat); + + __lsx_vst(p2_out, filter48, 0); + __lsx_vst(p1_out, filter48, 16); + __lsx_vst(p0_out, filter48, 32); + __lsx_vst(q0_out, filter48, 48); + __lsx_vst(q1_out, filter48, 64); + __lsx_vst(q2_out, filter48, 80); + __lsx_vst(flat, filter48, 96); + + return 0; +} + +static int32_t vt_lpf_t16_16w(uint8_t *dst, uint8_t *dst_org, int32_t stride, + uint8_t *filter48) { + __m128i flat, flat2, filter8; + __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + __m128i out_l, out_h; + v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in; + v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in; + v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in; 
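/* 16-bit zero-extended copies of the pixel rows: the _l_in vectors hold the low eight bytes of each row, the _h_in vectors the high eight */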
+ v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in; + v8u16 p7_h_in, p6_h_in, p5_h_in, p4_h_in; + v8u16 p3_h_in, p2_h_in, p1_h_in, p0_h_in; + v8u16 q7_h_in, q6_h_in, q5_h_in, q4_h_in; + v8u16 q3_h_in, q2_h_in, q1_h_in, q0_h_in; + v8u16 tmp0_l, tmp1_l, tmp0_h, tmp1_h; + uint8_t *dst_tmp = dst - 128; + + flat = __lsx_vld(filter48, 96); + + DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48, p7, + p6, p5, p4); + DUP4_ARG2(__lsx_vld, dst_tmp, 64, dst_tmp, 80, dst_tmp, 96, dst_tmp, 112, p3, + p2, p1, p0); + DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, q0, q1, q2, q3); + DUP4_ARG2(__lsx_vld, dst, 64, dst, 80, dst, 96, dst, 112, q4, q5, q6, q7); + + VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); + /* if flat2 is zero for all pixels, then no need to calculate other filter */ + if (__lsx_bz_v(flat2)) { + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + + DUP4_ARG2(__lsx_vld, filter48, 0, filter48, 16, filter48, 32, filter48, 48, + p2, p1, p0, q0); + DUP2_ARG2(__lsx_vld, filter48, 64, filter48, 80, q1, q2); + + DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, vec0, vec1); + vec3 = __lsx_vilvl_h(vec1, vec0); + vec4 = __lsx_vilvh_h(vec1, vec0); + DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, vec0, vec1); + vec6 = __lsx_vilvl_h(vec1, vec0); + vec7 = __lsx_vilvh_h(vec1, vec0); + vec2 = __lsx_vilvl_b(q2, q1); + vec5 = __lsx_vilvh_b(q2, q1); + + dst_org -= 3; + __lsx_vstelm_w(vec3, dst_org, 0, 0); + __lsx_vstelm_h(vec2, dst_org, 4, 0); + dst_org += stride; + __lsx_vstelm_w(vec3, dst_org, 0, 1); + __lsx_vstelm_h(vec2, dst_org, 4, 1); + dst_org += stride; + __lsx_vstelm_w(vec3, dst_org, 0, 2); + __lsx_vstelm_h(vec2, dst_org, 4, 2); + dst_org += stride; + __lsx_vstelm_w(vec3, dst_org, 0, 3); + __lsx_vstelm_h(vec2, dst_org, 4, 3); + dst_org += stride; + __lsx_vstelm_w(vec4, dst_org, 0, 0); + __lsx_vstelm_h(vec2, dst_org, 4, 4); + dst_org += stride; + __lsx_vstelm_w(vec4, dst_org, 0, 1); + __lsx_vstelm_h(vec2, dst_org, 4, 5); + dst_org += stride; + __lsx_vstelm_w(vec4, dst_org, 0, 2); + __lsx_vstelm_h(vec2, dst_org, 4, 6); + dst_org += stride; + __lsx_vstelm_w(vec4, dst_org, 0, 3); + __lsx_vstelm_h(vec2, dst_org, 4, 7); + dst_org += stride; + __lsx_vstelm_w(vec6, dst_org, 0, 0); + __lsx_vstelm_h(vec5, dst_org, 4, 0); + dst_org += stride; + __lsx_vstelm_w(vec6, dst_org, 0, 1); + __lsx_vstelm_h(vec5, dst_org, 4, 1); + dst_org += stride; + __lsx_vstelm_w(vec6, dst_org, 0, 2); + __lsx_vstelm_h(vec5, dst_org, 4, 2); + dst_org += stride; + __lsx_vstelm_w(vec6, dst_org, 0, 3); + __lsx_vstelm_h(vec5, dst_org, 4, 3); + dst_org += stride; + __lsx_vstelm_w(vec7, dst_org, 0, 0); + __lsx_vstelm_h(vec5, dst_org, 4, 4); + dst_org += stride; + __lsx_vstelm_w(vec7, dst_org, 0, 1); + __lsx_vstelm_h(vec5, dst_org, 4, 5); + dst_org += stride; + __lsx_vstelm_w(vec7, dst_org, 0, 2); + __lsx_vstelm_h(vec5, dst_org, 4, 6); + dst_org += stride; + __lsx_vstelm_w(vec7, dst_org, 0, 3); + __lsx_vstelm_h(vec5, dst_org, 4, 7); + + return 1; + } + + dst -= 7 * 16; + + p7_l_in = (v8u16)__lsx_vsllwil_hu_bu(p7, 0); + p6_l_in = (v8u16)__lsx_vsllwil_hu_bu(p6, 0); + p5_l_in = (v8u16)__lsx_vsllwil_hu_bu(p5, 0); + p4_l_in = (v8u16)__lsx_vsllwil_hu_bu(p4, 0); + p3_l_in = (v8u16)__lsx_vsllwil_hu_bu(p3, 0); + p2_l_in = (v8u16)__lsx_vsllwil_hu_bu(p2, 0); + p1_l_in = (v8u16)__lsx_vsllwil_hu_bu(p1, 0); + p0_l_in = (v8u16)__lsx_vsllwil_hu_bu(p0, 0); + q0_l_in = (v8u16)__lsx_vsllwil_hu_bu(q0, 0); + + tmp0_l = p7_l_in << 3; + tmp0_l -= p7_l_in; + tmp0_l += p6_l_in; + tmp0_l += q0_l_in; + tmp1_l = p6_l_in + p5_l_in; + 
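/* after the adds below, tmp1 holds the filter16 window sum for p6: 7*p7 + 2*p6 + p5+p4+p3+p2+p1+p0 + q0 */ +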
tmp1_l += p4_l_in; + tmp1_l += p3_l_in; + tmp1_l += p2_l_in; + tmp1_l += p1_l_in; + tmp1_l += p0_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + p7_h_in = (v8u16)__lsx_vexth_hu_bu(p7); + p6_h_in = (v8u16)__lsx_vexth_hu_bu(p6); + p5_h_in = (v8u16)__lsx_vexth_hu_bu(p5); + p4_h_in = (v8u16)__lsx_vexth_hu_bu(p4); + p3_h_in = (v8u16)__lsx_vexth_hu_bu(p3); + p2_h_in = (v8u16)__lsx_vexth_hu_bu(p2); + p1_h_in = (v8u16)__lsx_vexth_hu_bu(p1); + p0_h_in = (v8u16)__lsx_vexth_hu_bu(p0); + q0_h_in = (v8u16)__lsx_vexth_hu_bu(q0); + + tmp0_h = p7_h_in << 3; + tmp0_h -= p7_h_in; + tmp0_h += p6_h_in; + tmp0_h += q0_h_in; + tmp1_h = p6_h_in + p5_h_in; + tmp1_h += p4_h_in; + tmp1_h += p3_h_in; + tmp1_h += p2_h_in; + tmp1_h += p1_h_in; + tmp1_h += p0_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + p6 = __lsx_vbitsel_v(p6, out_l, flat2); + __lsx_vst(p6, dst, 0); + + /* p5 */ + q1_l_in = (v8u16)__lsx_vsllwil_hu_bu(q1, 0); + tmp0_l = p5_l_in - p6_l_in; + tmp0_l += q1_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + q1_h_in = (v8u16)__lsx_vexth_hu_bu(q1); + tmp0_h = p5_h_in - p6_h_in; + tmp0_h += q1_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + p5 = __lsx_vbitsel_v(p5, out_l, flat2); + __lsx_vst(p5, dst, 16); + + /* p4 */ + q2_l_in = (v8u16)__lsx_vsllwil_hu_bu(q2, 0); + tmp0_l = p4_l_in - p5_l_in; + tmp0_l += q2_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + q2_h_in = (v8u16)__lsx_vexth_hu_bu(q2); + tmp0_h = p4_h_in - p5_h_in; + tmp0_h += q2_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + p4 = __lsx_vbitsel_v(p4, out_l, flat2); + __lsx_vst(p4, dst, 16 * 2); + + /* p3 */ + q3_l_in = (v8u16)__lsx_vsllwil_hu_bu(q3, 0); + tmp0_l = p3_l_in - p4_l_in; + tmp0_l += q3_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + q3_h_in = (v8u16)__lsx_vexth_hu_bu(q3); + tmp0_h = p3_h_in - p4_h_in; + tmp0_h += q3_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + p3 = __lsx_vbitsel_v(p3, out_l, flat2); + __lsx_vst(p3, dst, 16 * 3); + + /* p2 */ + q4_l_in = (v8u16)__lsx_vsllwil_hu_bu(q4, 0); + filter8 = __lsx_vld(filter48, 0); + tmp0_l = p2_l_in - p3_l_in; + tmp0_l += q4_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + q4_h_in = (v8u16)__lsx_vexth_hu_bu(q4); + tmp0_h = p2_h_in - p3_h_in; + tmp0_h += q4_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 16 * 4); + + /* p1 */ + q5_l_in = (v8u16)__lsx_vsllwil_hu_bu(q5, 0); + filter8 = __lsx_vld(filter48, 16); + tmp0_l = p1_l_in - p2_l_in; + tmp0_l += q5_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + q5_h_in = (v8u16)__lsx_vexth_hu_bu(q5); + tmp0_h = p1_h_in - p2_h_in; + tmp0_h += q5_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)(tmp1_h), 4); + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 16 * 5); + + /* p0 */ + q6_l_in = 
(v8u16)__lsx_vsllwil_hu_bu(q6, 0); + filter8 = __lsx_vld(filter48, 32); + tmp0_l = p0_l_in - p1_l_in; + tmp0_l += q6_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + q6_h_in = (v8u16)__lsx_vexth_hu_bu(q6); + tmp0_h = p0_h_in - p1_h_in; + tmp0_h += q6_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 16 * 6); + + /* q0 */ + q7_l_in = (v8u16)__lsx_vsllwil_hu_bu(q7, 0); + filter8 = __lsx_vld(filter48, 48); + tmp0_l = q7_l_in - p0_l_in; + tmp0_l += q0_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + q7_h_in = (v8u16)__lsx_vexth_hu_bu(q7); + tmp0_h = q7_h_in - p0_h_in; + tmp0_h += q0_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 16 * 7); + + /* q1 */ + filter8 = __lsx_vld(filter48, 64); + tmp0_l = q7_l_in - q0_l_in; + tmp0_l += q1_l_in; + tmp0_l -= p6_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + tmp0_h = q7_h_in - q0_h_in; + tmp0_h += q1_h_in; + tmp0_h -= p6_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 16 * 8); + + /* q2 */ + filter8 = __lsx_vld(filter48, 80); + tmp0_l = q7_l_in - q1_l_in; + tmp0_l += q2_l_in; + tmp0_l -= p5_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + tmp0_h = q7_h_in - q1_h_in; + tmp0_h += q2_h_in; + tmp0_h -= p5_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 16 * 9); + + /* q3 */ + tmp0_l = q7_l_in - q2_l_in; + tmp0_l += q3_l_in; + tmp0_l -= p4_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + tmp0_h = q7_h_in - q2_h_in; + tmp0_h += q3_h_in; + tmp0_h -= p4_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + q3 = __lsx_vbitsel_v(q3, out_l, flat2); + __lsx_vst(q3, dst, 16 * 10); + + /* q4 */ + tmp0_l = q7_l_in - q3_l_in; + tmp0_l += q4_l_in; + tmp0_l -= p3_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + tmp0_h = q7_h_in - q3_h_in; + tmp0_h += q4_h_in; + tmp0_h -= p3_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + q4 = __lsx_vbitsel_v(q4, out_l, flat2); + __lsx_vst(q4, dst, 16 * 11); + + /* q5 */ + tmp0_l = q7_l_in - q4_l_in; + tmp0_l += q5_l_in; + tmp0_l -= p2_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + tmp0_h = q7_h_in - q4_h_in; + tmp0_h += q5_h_in; + tmp0_h -= p2_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + q5 = __lsx_vbitsel_v(q5, out_l, flat2); + __lsx_vst(q5, dst, 16 * 12); + + /* q6 */ + tmp0_l = q7_l_in - q5_l_in; + tmp0_l += q6_l_in; + tmp0_l -= p1_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + tmp0_h = q7_h_in - q5_h_in; + tmp0_h += q6_h_in; + tmp0_h -= p1_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + q6 = __lsx_vbitsel_v(q6, out_l, flat2); + 
__lsx_vst(q6, dst, 16 * 13); + + return 0; +} + +void vpx_lpf_vertical_16_dual_lsx(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + uint8_t early_exit = 0; + DECLARE_ALIGNED(16, uint8_t, transposed_input[16 * 24]); + uint8_t *filter48 = &transposed_input[16 * 16]; + + transpose_16x16((src - 8), pitch, &transposed_input[0], 16); + + early_exit = + vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), &filter48[0], src, + pitch, b_limit_ptr, limit_ptr, thresh_ptr); + + if (early_exit == 0) { + early_exit = + vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch, &filter48[0]); + + if (early_exit == 0) { + transpose_16x16(transposed_input, 16, (src - 8), pitch); + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_4_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_4_lsx.c new file mode 100644 index 0000000000..9300b5c5ae --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_4_lsx.c @@ -0,0 +1,214 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/loopfilter_lsx.h" + +void vpx_lpf_horizontal_4_lsx(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + __m128i mask, hev, flat, thresh, b_limit, limit; + __m128i p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out; + int32_t pitch2 = pitch << 1; + int32_t pitch3 = pitch2 + pitch; + int32_t pitch4 = pitch2 << 1; + + DUP4_ARG2(__lsx_vldx, src, -pitch4, src, -pitch3, src, -pitch2, src, -pitch, + p3, p2, p1, p0); + q0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, pitch, src, pitch2, q1, q2); + q3 = __lsx_vldx(src, pitch3); + + thresh = __lsx_vldrepl_b(thresh_ptr, 0); + b_limit = __lsx_vldrepl_b(b_limit_ptr, 0); + limit = __lsx_vldrepl_b(limit_ptr, 0); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + __lsx_vstelm_d(p1_out, src - pitch2, 0, 0); + __lsx_vstelm_d(p0_out, src - pitch, 0, 0); + __lsx_vstelm_d(q0_out, src, 0, 0); + __lsx_vstelm_d(q1_out, src + pitch, 0, 0); +} + +void vpx_lpf_horizontal_4_dual_lsx(uint8_t *src, int32_t pitch, + const uint8_t *b_limit0_ptr, + const uint8_t *limit0_ptr, + const uint8_t *thresh0_ptr, + const uint8_t *b_limit1_ptr, + const uint8_t *limit1_ptr, + const uint8_t *thresh1_ptr) { + __m128i mask, hev, flat, thresh0, b_limit0, limit0, thresh1, b_limit1, limit1; + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + int32_t pitch2 = pitch << 1; + int32_t pitch3 = pitch2 + pitch; + int32_t pitch4 = pitch2 << 1; + + DUP4_ARG2(__lsx_vldx, src, -pitch4, src, -pitch3, src, -pitch2, src, -pitch, + p3, p2, p1, p0); + q0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, pitch, src, pitch2, q1, q2); + q3 = __lsx_vldx(src, pitch3); + + thresh0 = __lsx_vldrepl_b(thresh0_ptr, 0); + thresh1 = __lsx_vldrepl_b(thresh1_ptr, 0); + thresh0 = __lsx_vilvl_d(thresh1, thresh0); + + b_limit0 = __lsx_vldrepl_b(b_limit0_ptr, 0); + b_limit1 = __lsx_vldrepl_b(b_limit1_ptr, 0); + b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0); + 
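/* interleave per-block values: the low 64 bits filter the first eight columns, the high 64 bits the second */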
+ limit0 = __lsx_vldrepl_b(limit0_ptr, 0); + limit1 = __lsx_vldrepl_b(limit1_ptr, 0); + limit0 = __lsx_vilvl_d(limit1, limit0); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev, + mask, flat); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); + + __lsx_vstx(p1, src, -pitch2); + __lsx_vstx(p0, src, -pitch); + __lsx_vst(q0, src, 0); + __lsx_vstx(q1, src, pitch); +} + +void vpx_lpf_vertical_4_lsx(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + __m128i mask, hev, flat, limit, thresh, b_limit; + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i vec0, vec1, vec2, vec3; + int32_t pitch2 = pitch << 1; + int32_t pitch3 = pitch2 + pitch; + int32_t pitch4 = pitch2 << 1; + uint8_t *src_tmp = src - 4; + + p3 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, p2, p1); + p0 = __lsx_vldx(src_tmp, pitch3); + src_tmp += pitch4; + q0 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, q1, q2); + q3 = __lsx_vldx(src_tmp, pitch3); + + thresh = __lsx_vldrepl_b(thresh_ptr, 0); + b_limit = __lsx_vldrepl_b(b_limit_ptr, 0); + limit = __lsx_vldrepl_b(limit_ptr, 0); + + LSX_TRANSPOSE8x8_B(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2, + q3); + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); + DUP2_ARG2(__lsx_vilvl_b, p0, p1, q1, q0, vec0, vec1); + vec2 = __lsx_vilvl_h(vec1, vec0); + vec3 = __lsx_vilvh_h(vec1, vec0); + + src -= 2; + __lsx_vstelm_w(vec2, src, 0, 0); + src += pitch; + __lsx_vstelm_w(vec2, src, 0, 1); + src += pitch; + __lsx_vstelm_w(vec2, src, 0, 2); + src += pitch; + __lsx_vstelm_w(vec2, src, 0, 3); + src += pitch; + + __lsx_vstelm_w(vec3, src, 0, 0); + __lsx_vstelm_w(vec3, src + pitch, 0, 1); + __lsx_vstelm_w(vec3, src + pitch2, 0, 2); + __lsx_vstelm_w(vec3, src + pitch3, 0, 3); +} + +void vpx_lpf_vertical_4_dual_lsx(uint8_t *src, int32_t pitch, + const uint8_t *b_limit0_ptr, + const uint8_t *limit0_ptr, + const uint8_t *thresh0_ptr, + const uint8_t *b_limit1_ptr, + const uint8_t *limit1_ptr, + const uint8_t *thresh1_ptr) { + __m128i mask, hev, flat; + __m128i thresh0, b_limit0, limit0, thresh1, b_limit1, limit1; + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i row0, row1, row2, row3, row4, row5, row6, row7; + __m128i row8, row9, row10, row11, row12, row13, row14, row15; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + int32_t pitch2 = pitch << 1; + int32_t pitch3 = pitch2 + pitch; + int32_t pitch4 = pitch2 << 1; + uint8_t *src_tmp = src - 4; + + row0 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, row1, row2); + row3 = __lsx_vldx(src_tmp, pitch3); + src_tmp += pitch4; + row4 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, row5, row6); + row7 = __lsx_vldx(src_tmp, pitch3); + src_tmp += pitch4; + row8 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, row9, row10); + row11 = __lsx_vldx(src_tmp, pitch3); + src_tmp += pitch4; + row12 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, row13, row14); + row15 = __lsx_vldx(src_tmp, pitch3); + + LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8, + row9, row10, row11, row12, row13, row14, row15, p3, p2, + p1, p0, q0, q1, q2, q3); + + thresh0 = __lsx_vldrepl_b(thresh0_ptr, 0); + thresh1 = __lsx_vldrepl_b(thresh1_ptr, 
0); + thresh0 = __lsx_vilvl_d(thresh1, thresh0); + + b_limit0 = __lsx_vldrepl_b(b_limit0_ptr, 0); + b_limit1 = __lsx_vldrepl_b(b_limit1_ptr, 0); + b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0); + + limit0 = __lsx_vldrepl_b(limit0_ptr, 0); + limit1 = __lsx_vldrepl_b(limit1_ptr, 0); + limit0 = __lsx_vilvl_d(limit1, limit0); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev, + mask, flat); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); + DUP2_ARG2(__lsx_vilvl_b, p0, p1, q1, q0, tmp0, tmp1); + tmp2 = __lsx_vilvl_h(tmp1, tmp0); + tmp3 = __lsx_vilvh_h(tmp1, tmp0); + DUP2_ARG2(__lsx_vilvh_b, p0, p1, q1, q0, tmp0, tmp1); + tmp4 = __lsx_vilvl_h(tmp1, tmp0); + tmp5 = __lsx_vilvh_h(tmp1, tmp0); + + src -= 2; + __lsx_vstelm_w(tmp2, src, 0, 0); + __lsx_vstelm_w(tmp2, src + pitch, 0, 1); + __lsx_vstelm_w(tmp2, src + pitch2, 0, 2); + __lsx_vstelm_w(tmp2, src + pitch3, 0, 3); + src += pitch4; + __lsx_vstelm_w(tmp3, src, 0, 0); + __lsx_vstelm_w(tmp3, src + pitch, 0, 1); + __lsx_vstelm_w(tmp3, src + pitch2, 0, 2); + __lsx_vstelm_w(tmp3, src + pitch3, 0, 3); + src += pitch4; + __lsx_vstelm_w(tmp4, src, 0, 0); + __lsx_vstelm_w(tmp4, src + pitch, 0, 1); + __lsx_vstelm_w(tmp4, src + pitch2, 0, 2); + __lsx_vstelm_w(tmp4, src + pitch3, 0, 3); + src += pitch4; + __lsx_vstelm_w(tmp5, src, 0, 0); + __lsx_vstelm_w(tmp5, src + pitch, 0, 1); + __lsx_vstelm_w(tmp5, src + pitch2, 0, 2); + __lsx_vstelm_w(tmp5, src + pitch3, 0, 3); +} diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_8_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_8_lsx.c new file mode 100644 index 0000000000..00219ba71d --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_8_lsx.c @@ -0,0 +1,458 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/loopfilter_lsx.h" + +void vpx_lpf_horizontal_8_lsx(uint8_t *dst, int32_t stride, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + __m128i mask, hev, flat, thresh, b_limit, limit; + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i p2_out, p1_out, p0_out, q0_out, q1_out; + __m128i p2_filter8, p1_filter8, p0_filter8; + __m128i q0_filter8, q1_filter8, q2_filter8; + __m128i p3_l, p2_l, p1_l, p0_l, q3_l, q2_l, q1_l, q0_l; + + int32_t stride2 = stride << 1; + int32_t stride3 = stride2 + stride; + int32_t stride4 = stride2 << 1; + + /* load vector elements */ + DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2, dst, + -stride, p3, p2, p1, p0); + q0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2); + q3 = __lsx_vldx(dst, stride3); + + thresh = __lsx_vldrepl_b(thresh_ptr, 0); + b_limit = __lsx_vldrepl_b(b_limit_ptr, 0); + limit = __lsx_vldrepl_b(limit_ptr, 0); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + flat = __lsx_vilvl_d(flat, flat); + + if (__lsx_bz_v(flat)) { + __lsx_vstelm_d(p1_out, dst - stride2, 0, 0); + __lsx_vstelm_d(p0_out, dst - stride, 0, 0); + __lsx_vstelm_d(q0_out, dst, 0, 0); + __lsx_vstelm_d(q1_out, dst + stride, 0, 0); + } else { + DUP4_ARG2(__lsx_vsllwil_hu_bu, p3, 0, p2, 0, p1, 0, p0, 0, p3_l, p2_l, p1_l, + p0_l); + DUP4_ARG2(__lsx_vsllwil_hu_bu, q0, 0, q1, 0, q2, 0, q3, 0, q0_l, q1_l, q2_l, + q3_l); + VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filter8, + p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8); + + DUP2_ARG2(__lsx_vpickev_b, p1_filter8, p2_filter8, q0_filter8, p0_filter8, + p1_filter8, q0_filter8); + q2_filter8 = __lsx_vpickev_b(q2_filter8, q1_filter8); + + p2 = __lsx_vilvl_d(p1_out, p2); + p0_out = __lsx_vilvl_d(q0_out, p0_out); + q1_out = __lsx_vilvl_d(q2, q1_out); + + DUP2_ARG3(__lsx_vbitsel_v, p2, p1_filter8, flat, p0_out, q0_filter8, flat, + p2_out, p1_out); + p0_out = __lsx_vbitsel_v(q1_out, q2_filter8, flat); + dst -= stride3; + + __lsx_vstelm_d(p2_out, dst, 0, 0); + __lsx_vstelm_d(p2_out, dst + stride, 0, 1); + __lsx_vstelm_d(p1_out, dst + stride2, 0, 0); + __lsx_vstelm_d(p1_out, dst + stride3, 0, 1); + + dst += stride4; + __lsx_vstelm_d(p0_out, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(p0_out, dst, 0, 1); + } +} + +void vpx_lpf_horizontal_8_dual_lsx( + uint8_t *dst, int32_t stride, const uint8_t *b_limit0, + const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *b_limit1, + const uint8_t *limit1, const uint8_t *thresh1) { + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + __m128i flat, mask, hev, thresh, b_limit, limit; + __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; + __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h; + __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l; + __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l; + __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h; + __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h; + + int32_t stride2 = stride << 1; + int32_t stride3 = stride2 + stride; + int32_t stride4 = stride2 << 1; + + DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2, dst, + -stride, p3, p2, p1, p0); + q0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2); + q3 = __lsx_vldx(dst, stride3); + + 
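/* both 8-column blocks are filtered in one pass; each parameter vector carries block 0 in its low half and block 1 in its high half */ +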
thresh = __lsx_vldrepl_b(thresh0, 0); + p2_out = __lsx_vldrepl_b(thresh1, 0); + thresh = __lsx_vilvl_d(p2_out, thresh); + + b_limit = __lsx_vldrepl_b(b_limit0, 0); + p2_out = __lsx_vldrepl_b(b_limit1, 0); + b_limit = __lsx_vilvl_d(p2_out, b_limit); + + limit = __lsx_vldrepl_b(limit0, 0); + p2_out = __lsx_vldrepl_b(limit1, 0); + limit = __lsx_vilvl_d(p2_out, limit); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + if (__lsx_bz_v(flat)) { + __lsx_vst(p1_out, dst - stride2, 0); + __lsx_vst(p0_out, dst - stride, 0); + __lsx_vst(q0_out, dst, 0); + __lsx_vst(q1_out, dst + stride, 0); + } else { + DUP4_ARG2(__lsx_vsllwil_hu_bu, p3, 0, p2, 0, p1, 0, p0, 0, p3_l, p2_l, p1_l, + p0_l); + DUP4_ARG2(__lsx_vsllwil_hu_bu, q0, 0, q1, 0, q2, 0, q3, 0, q0_l, q1_l, q2_l, + q3_l); + VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); + + DUP4_ARG1(__lsx_vexth_hu_bu, p3, p2, p1, p0, p3_h, p2_h, p1_h, p0_h); + DUP4_ARG1(__lsx_vexth_hu_bu, q0, q1, q2, q3, q0_h, q1_h, q2_h, q3_h); + VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h, + p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h); + + /* convert 16 bit output data into 8 bit */ + DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h, p1_filt8_l, + p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l); + DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h, q2_filt8_l, + q1_filt8_l, q2_filt8_l); + + /* store pixel values */ + p2_out = __lsx_vbitsel_v(p2, p2_filt8_l, flat); + p1_out = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat); + p0_out = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat); + q0_out = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat); + q1_out = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat); + q2_out = __lsx_vbitsel_v(q2, q2_filt8_l, flat); + + __lsx_vst(p2_out, dst - stride3, 0); + __lsx_vst(p1_out, dst - stride2, 0); + __lsx_vst(p0_out, dst - stride, 0); + __lsx_vst(q0_out, dst, 0); + __lsx_vst(q1_out, dst + stride, 0); + __lsx_vst(q2_out, dst + stride2, 0); + } +} + +void vpx_lpf_vertical_8_lsx(uint8_t *dst, int32_t stride, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i p1_out, p0_out, q0_out, q1_out; + __m128i flat, mask, hev, thresh, b_limit, limit; + __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; + __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l; + __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l; + __m128i zero = __lsx_vldi(0); + + int32_t stride2 = stride << 1; + int32_t stride3 = stride2 + stride; + int32_t stride4 = stride2 << 1; + uint8_t *dst_tmp = dst - 4; + + /* load vector elements */ + p3 = __lsx_vld(dst_tmp, 0); + DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, p2, p1); + p0 = __lsx_vldx(dst_tmp, stride3); + dst_tmp += stride4; + q0 = __lsx_vld(dst_tmp, 0); + DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, q1, q2); + q3 = __lsx_vldx(dst_tmp, stride3); + + LSX_TRANSPOSE8x8_B(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2, + q3); + + thresh = __lsx_vldrepl_b(thresh_ptr, 0); + b_limit = __lsx_vldrepl_b(b_limit_ptr, 0); + limit = __lsx_vldrepl_b(limit_ptr, 0); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + 
/* flat4 */ + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + /* filter4 */ + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + flat = __lsx_vilvl_d(zero, flat); + + /* if flat is zero for all pixels, then no need to calculate other filter */ + if (__lsx_bz_v(flat)) { + /* Store 4 pixels p1 - q1 */ + DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, p0, p1); + p2 = __lsx_vilvl_h(p1, p0); + p3 = __lsx_vilvh_h(p1, p0); + + dst -= 2; + __lsx_vstelm_w(p2, dst, 0, 0); + __lsx_vstelm_w(p2, dst + stride, 0, 1); + __lsx_vstelm_w(p2, dst + stride2, 0, 2); + __lsx_vstelm_w(p2, dst + stride3, 0, 3); + dst += stride4; + __lsx_vstelm_w(p3, dst, 0, 0); + __lsx_vstelm_w(p3, dst + stride, 0, 1); + __lsx_vstelm_w(p3, dst + stride2, 0, 2); + __lsx_vstelm_w(p3, dst + stride3, 0, 3); + } else { + DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, + p1_l, p0_l); + DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, + q2_l, q3_l); + VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); + /* convert 16 bit output data into 8 bit */ + DUP4_ARG2(__lsx_vpickev_b, p2_filt8_l, p2_filt8_l, p1_filt8_l, p1_filt8_l, + p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l); + DUP2_ARG2(__lsx_vpickev_b, q1_filt8_l, q1_filt8_l, q2_filt8_l, q2_filt8_l, + q1_filt8_l, q2_filt8_l); + /* store pixel values */ + p2 = __lsx_vbitsel_v(p2, p2_filt8_l, flat); + p1 = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat); + p0 = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat); + q0 = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat); + q1 = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat); + q2 = __lsx_vbitsel_v(q2, q2_filt8_l, flat); + + /* Store 6 pixels p2 - q2 */ + DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, p3, q3); + p1 = __lsx_vilvl_h(q3, p3); + p2 = __lsx_vilvh_h(q3, p3); + p3 = __lsx_vilvl_b(q2, q1); + dst -= 3; + __lsx_vstelm_w(p1, dst, 0, 0); + __lsx_vstelm_h(p3, dst, 4, 0); + dst += stride; + __lsx_vstelm_w(p1, dst, 0, 1); + __lsx_vstelm_h(p3, dst, 4, 1); + dst += stride; + __lsx_vstelm_w(p1, dst, 0, 2); + __lsx_vstelm_h(p3, dst, 4, 2); + dst += stride; + __lsx_vstelm_w(p1, dst, 0, 3); + __lsx_vstelm_h(p3, dst, 4, 3); + dst += stride; + __lsx_vstelm_w(p2, dst, 0, 0); + __lsx_vstelm_h(p3, dst, 4, 4); + dst += stride; + __lsx_vstelm_w(p2, dst, 0, 1); + __lsx_vstelm_h(p3, dst, 4, 5); + dst += stride; + __lsx_vstelm_w(p2, dst, 0, 2); + __lsx_vstelm_h(p3, dst, 4, 6); + dst += stride; + __lsx_vstelm_w(p2, dst, 0, 3); + __lsx_vstelm_h(p3, dst, 4, 7); + } +} + +void vpx_lpf_vertical_8_dual_lsx(uint8_t *dst, int32_t stride, + const uint8_t *b_limit0, const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *b_limit1, const uint8_t *limit1, + const uint8_t *thresh1) { + uint8_t *dst_tmp = dst - 4; + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i p1_out, p0_out, q0_out, q1_out; + __m128i flat, mask, hev, thresh, b_limit, limit; + __m128i row4, row5, row6, row7, row12, row13, row14, row15; + __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; + __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h; + __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l; + __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l; + __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h; + __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h; + int32_t stride2 = stride << 1; + int32_t stride3 = stride2 + stride; + int32_t stride4 = stride2 << 1; + + p0 = __lsx_vld(dst_tmp, 0); + DUP2_ARG2(__lsx_vldx, dst_tmp, 
stride, dst_tmp, stride2, p1, p2); + p3 = __lsx_vldx(dst_tmp, stride3); + dst_tmp += stride4; + row4 = __lsx_vld(dst_tmp, 0); + DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row5, row6); + row7 = __lsx_vldx(dst_tmp, stride3); + dst_tmp += stride4; + + q3 = __lsx_vld(dst_tmp, 0); + DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, q2, q1); + q0 = __lsx_vldx(dst_tmp, stride3); + dst_tmp += stride4; + row12 = __lsx_vld(dst_tmp, 0); + DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row13, row14); + row15 = __lsx_vldx(dst_tmp, stride3); + + /* transpose 16x8 matrix into 8x16 */ + LSX_TRANSPOSE16x8_B(p0, p1, p2, p3, row4, row5, row6, row7, q3, q2, q1, q0, + row12, row13, row14, row15, p3, p2, p1, p0, q0, q1, q2, + q3); + + thresh = __lsx_vldrepl_b(thresh0, 0); + p1_out = __lsx_vldrepl_b(thresh1, 0); + thresh = __lsx_vilvl_d(p1_out, thresh); + + b_limit = __lsx_vldrepl_b(b_limit0, 0); + p1_out = __lsx_vldrepl_b(b_limit1, 0); + b_limit = __lsx_vilvl_d(p1_out, b_limit); + + limit = __lsx_vldrepl_b(limit0, 0); + p1_out = __lsx_vldrepl_b(limit1, 0); + limit = __lsx_vilvl_d(p1_out, limit); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + /* flat4 */ + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + /* filter4 */ + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + /* if flat is zero for all pixels, then no need to calculate other filter */ + if (__lsx_bz_v(flat)) { + DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, p0, p1); + p2 = __lsx_vilvl_h(p1, p0); + p3 = __lsx_vilvh_h(p1, p0); + DUP2_ARG2(__lsx_vilvh_b, p0_out, p1_out, q1_out, q0_out, p0, p1); + q2 = __lsx_vilvl_h(p1, p0); + q3 = __lsx_vilvh_h(p1, p0); + dst -= 2; + __lsx_vstelm_w(p2, dst, 0, 0); + __lsx_vstelm_w(p2, dst + stride, 0, 1); + __lsx_vstelm_w(p2, dst + stride2, 0, 2); + __lsx_vstelm_w(p2, dst + stride3, 0, 3); + dst += stride4; + __lsx_vstelm_w(p3, dst, 0, 0); + __lsx_vstelm_w(p3, dst + stride, 0, 1); + __lsx_vstelm_w(p3, dst + stride2, 0, 2); + __lsx_vstelm_w(p3, dst + stride3, 0, 3); + dst += stride4; + __lsx_vstelm_w(q2, dst, 0, 0); + __lsx_vstelm_w(q2, dst + stride, 0, 1); + __lsx_vstelm_w(q2, dst + stride2, 0, 2); + __lsx_vstelm_w(q2, dst + stride3, 0, 3); + dst += stride4; + __lsx_vstelm_w(q3, dst, 0, 0); + __lsx_vstelm_w(q3, dst + stride, 0, 1); + __lsx_vstelm_w(q3, dst + stride2, 0, 2); + __lsx_vstelm_w(q3, dst + stride3, 0, 3); + } else { + DUP4_ARG2(__lsx_vsllwil_hu_bu, p3, 0, p2, 0, p1, 0, p0, 0, p3_l, p2_l, p1_l, + p0_l); + DUP4_ARG2(__lsx_vsllwil_hu_bu, q0, 0, q1, 0, q2, 0, q3, 0, q0_l, q1_l, q2_l, + q3_l); + VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); + DUP4_ARG1(__lsx_vexth_hu_bu, p3, p2, p1, p0, p3_h, p2_h, p1_h, p0_h); + DUP4_ARG1(__lsx_vexth_hu_bu, q0, q1, q2, q3, q0_h, q1_h, q2_h, q3_h); + + /* filter8 */ + VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h, + p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h); + + /* convert 16 bit output data into 8 bit */ + DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h, p1_filt8_l, + p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l); + DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h, q2_filt8_l, + q1_filt8_l, q2_filt8_l); + + /* store pixel values */ + p2 = __lsx_vbitsel_v(p2, p2_filt8_l, flat); + p1 = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat); + p0 = 
__lsx_vbitsel_v(p0_out, p0_filt8_l, flat); + q0 = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat); + q1 = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat); + q2 = __lsx_vbitsel_v(q2, q2_filt8_l, flat); + + DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, p3, q3); + p2_filt8_l = __lsx_vilvl_h(q3, p3); + p2_filt8_h = __lsx_vilvh_h(q3, p3); + DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, p3, q3); + p0_filt8_l = __lsx_vilvl_h(q3, p3); + p0_filt8_h = __lsx_vilvh_h(q3, p3); + q1_filt8_l = __lsx_vilvl_b(q2, q1); + q1_filt8_h = __lsx_vilvh_b(q2, q1); + + dst -= 3; + __lsx_vstelm_w(p2_filt8_l, dst, 0, 0); + __lsx_vstelm_h(q1_filt8_l, dst, 4, 0); + dst += stride; + __lsx_vstelm_w(p2_filt8_l, dst, 0, 1); + __lsx_vstelm_h(q1_filt8_l, dst, 4, 1); + dst += stride; + __lsx_vstelm_w(p2_filt8_l, dst, 0, 2); + __lsx_vstelm_h(q1_filt8_l, dst, 4, 2); + dst += stride; + __lsx_vstelm_w(p2_filt8_l, dst, 0, 3); + __lsx_vstelm_h(q1_filt8_l, dst, 4, 3); + dst += stride; + __lsx_vstelm_w(p2_filt8_h, dst, 0, 0); + __lsx_vstelm_h(q1_filt8_l, dst, 4, 4); + dst += stride; + __lsx_vstelm_w(p2_filt8_h, dst, 0, 1); + __lsx_vstelm_h(q1_filt8_l, dst, 4, 5); + dst += stride; + __lsx_vstelm_w(p2_filt8_h, dst, 0, 2); + __lsx_vstelm_h(q1_filt8_l, dst, 4, 6); + dst += stride; + __lsx_vstelm_w(p2_filt8_h, dst, 0, 3); + __lsx_vstelm_h(q1_filt8_l, dst, 4, 7); + dst += stride; + __lsx_vstelm_w(p0_filt8_l, dst, 0, 0); + __lsx_vstelm_h(q1_filt8_h, dst, 4, 0); + dst += stride; + __lsx_vstelm_w(p0_filt8_l, dst, 0, 1); + __lsx_vstelm_h(q1_filt8_h, dst, 4, 1); + dst += stride; + __lsx_vstelm_w(p0_filt8_l, dst, 0, 2); + __lsx_vstelm_h(q1_filt8_h, dst, 4, 2); + dst += stride; + __lsx_vstelm_w(p0_filt8_l, dst, 0, 3); + __lsx_vstelm_h(q1_filt8_h, dst, 4, 3); + dst += stride; + __lsx_vstelm_w(p0_filt8_h, dst, 0, 0); + __lsx_vstelm_h(q1_filt8_h, dst, 4, 4); + dst += stride; + __lsx_vstelm_w(p0_filt8_h, dst, 0, 1); + __lsx_vstelm_h(q1_filt8_h, dst, 4, 5); + dst += stride; + __lsx_vstelm_w(p0_filt8_h, dst, 0, 2); + __lsx_vstelm_h(q1_filt8_h, dst, 4, 6); + dst += stride; + __lsx_vstelm_w(p0_filt8_h, dst, 0, 3); + __lsx_vstelm_h(q1_filt8_h, dst, 4, 7); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_lsx.h b/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_lsx.h new file mode 100644 index 0000000000..1c43836503 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_lsx.h @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VPX_VPX_DSP_LOONGARCH_LOOPFILTER_LSX_H_ +#define VPX_VPX_DSP_LOONGARCH_LOOPFILTER_LSX_H_ + +#include "vpx_util/loongson_intrinsics.h" + +#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \ + limit_in, b_limit_in, thresh_in, hev_out, mask_out, \ + flat_out) \ + do { \ + __m128i p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \ + __m128i p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \ + \ + /* absolute subtraction of pixel values */ \ + p3_asub_p2_m = __lsx_vabsd_bu(p3_in, p2_in); \ + p2_asub_p1_m = __lsx_vabsd_bu(p2_in, p1_in); \ + p1_asub_p0_m = __lsx_vabsd_bu(p1_in, p0_in); \ + q1_asub_q0_m = __lsx_vabsd_bu(q1_in, q0_in); \ + q2_asub_q1_m = __lsx_vabsd_bu(q2_in, q1_in); \ + q3_asub_q2_m = __lsx_vabsd_bu(q3_in, q2_in); \ + p0_asub_q0_m = __lsx_vabsd_bu(p0_in, q0_in); \ + p1_asub_q1_m = __lsx_vabsd_bu(p1_in, q1_in); \ + \ + /* calculation of hev */ \ + flat_out = __lsx_vmax_bu(p1_asub_p0_m, q1_asub_q0_m); \ + hev_out = __lsx_vslt_bu(thresh_in, flat_out); \ + \ + /* calculation of mask */ \ + p0_asub_q0_m = __lsx_vsadd_bu(p0_asub_q0_m, p0_asub_q0_m); \ + p1_asub_q1_m = __lsx_vsrli_b(p1_asub_q1_m, 1); \ + p0_asub_q0_m = __lsx_vsadd_bu(p0_asub_q0_m, p1_asub_q1_m); \ + mask_out = __lsx_vslt_bu(b_limit_in, p0_asub_q0_m); \ + mask_out = __lsx_vmax_bu(flat_out, mask_out); \ + p3_asub_p2_m = __lsx_vmax_bu(p3_asub_p2_m, p2_asub_p1_m); \ + mask_out = __lsx_vmax_bu(p3_asub_p2_m, mask_out); \ + q2_asub_q1_m = __lsx_vmax_bu(q2_asub_q1_m, q3_asub_q2_m); \ + mask_out = __lsx_vmax_bu(q2_asub_q1_m, mask_out); \ + \ + mask_out = __lsx_vslt_bu(limit_in, mask_out); \ + mask_out = __lsx_vxori_b(mask_out, 0xff); \ + } while (0) + +#define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) \ + do { \ + __m128i p2_asub_p0, q2_asub_q0, p3_asub_p0, q3_asub_q0; \ + __m128i flat4_tmp = __lsx_vldi(1); \ + \ + DUP4_ARG2(__lsx_vabsd_bu, p2_in, p0_in, q2_in, q0_in, p3_in, p0_in, q3_in, \ + q0_in, p2_asub_p0, q2_asub_q0, p3_asub_p0, q3_asub_q0); \ + p2_asub_p0 = __lsx_vmax_bu(p2_asub_p0, q2_asub_q0); \ + flat_out = __lsx_vmax_bu(p2_asub_p0, flat_out); \ + p3_asub_p0 = __lsx_vmax_bu(p3_asub_p0, q3_asub_q0); \ + flat_out = __lsx_vmax_bu(p3_asub_p0, flat_out); \ + \ + flat_out = __lsx_vslt_bu(flat4_tmp, flat_out); \ + flat_out = __lsx_vxori_b(flat_out, 0xff); \ + flat_out = flat_out & (mask); \ + } while (0) + +#define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, q5_in, \ + q6_in, q7_in, flat_in, flat2_out) \ + do { \ + __m128i flat5_tmp = __lsx_vldi(1); \ + __m128i p4_asub_p0, q4_asub_q0, p5_asub_p0, q5_asub_q0; \ + __m128i p6_asub_p0, q6_asub_q0, p7_asub_p0, q7_asub_q0; \ + DUP4_ARG2(__lsx_vabsd_bu, p4_in, p0_in, q4_in, q0_in, p5_in, p0_in, q5_in, \ + q0_in, p4_asub_p0, q4_asub_q0, p5_asub_p0, q5_asub_q0); \ + DUP4_ARG2(__lsx_vabsd_bu, p6_in, p0_in, q6_in, q0_in, p7_in, p0_in, q7_in, \ + q0_in, p6_asub_p0, q6_asub_q0, p7_asub_p0, q7_asub_q0); \ + \ + DUP2_ARG2(__lsx_vmax_bu, p4_asub_p0, q4_asub_q0, p5_asub_p0, q5_asub_q0, \ + p4_asub_p0, flat2_out); \ + flat2_out = __lsx_vmax_bu(p4_asub_p0, flat2_out); \ + p6_asub_p0 = __lsx_vmax_bu(p6_asub_p0, q6_asub_q0); \ + flat2_out = __lsx_vmax_bu(p6_asub_p0, flat2_out); \ + p7_asub_p0 = __lsx_vmax_bu(p7_asub_p0, q7_asub_q0); \ + flat2_out = __lsx_vmax_bu(p7_asub_p0, flat2_out); \ + flat2_out = __lsx_vslt_bu(flat5_tmp, flat2_out); \ + flat2_out = __lsx_vxori_b(flat2_out, 0xff); \ + flat2_out = flat2_out & flat_in; \ + } while (0) + +#define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask, hev, 
p1_out, \ + p0_out, q0_out, q1_out) \ + do { \ + __m128i p1_m, p0_m, q0_m, q1_m, filt, q0_sub_p0, t1, t2; \ + const __m128i cnst4b = __lsx_vldi(4); \ + const __m128i cnst3b = __lsx_vldi(3); \ + DUP4_ARG2(__lsx_vxori_b, p1_in, 0x80, p0_in, 0x80, q0_in, 0x80, q1_in, \ + 0x80, p1_m, p0_m, q0_m, q1_m); \ + filt = __lsx_vssub_b(p1_m, q1_m); \ + filt &= hev; \ + \ + q0_sub_p0 = __lsx_vssub_b(q0_m, p0_m); \ + filt = __lsx_vsadd_b(filt, q0_sub_p0); \ + filt = __lsx_vsadd_b(filt, q0_sub_p0); \ + filt = __lsx_vsadd_b(filt, q0_sub_p0); \ + filt &= mask; \ + DUP2_ARG2(__lsx_vsadd_b, filt, cnst4b, filt, cnst3b, t1, t2); \ + DUP2_ARG2(__lsx_vsrai_b, t1, 3, t2, 3, t1, t2); \ + \ + q0_m = __lsx_vssub_b(q0_m, t1); \ + p0_m = __lsx_vsadd_b(p0_m, t2); \ + DUP2_ARG2(__lsx_vxori_b, q0_m, 0x80, p0_m, 0x80, q0_out, p0_out); \ + \ + filt = __lsx_vsrari_b(t1, 1); \ + hev = __lsx_vxori_b(hev, 0xff); \ + filt &= hev; \ + q1_m = __lsx_vssub_b(q1_m, filt); \ + p1_m = __lsx_vsadd_b(p1_m, filt); \ + DUP2_ARG2(__lsx_vxori_b, q1_m, 0x80, p1_m, 0x80, q1_out, p1_out); \ + } while (0) + +#define VP9_FILTER8(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \ + p2_filt8_out, p1_filt8_out, p0_filt8_out, q0_filt8_out, \ + q1_filt8_out, q2_filt8_out) \ + do { \ + __m128i tmp_filt8_0, tmp_filt8_1, tmp_filt8_2; \ + \ + tmp_filt8_2 = __lsx_vadd_h(p2_in, p1_in); \ + tmp_filt8_2 = __lsx_vadd_h(tmp_filt8_2, p0_in); \ + tmp_filt8_0 = __lsx_vslli_h(p3_in, 1); \ + \ + tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_0, tmp_filt8_2); \ + tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_0, q0_in); \ + tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_0, p3_in); \ + tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_1, p2_in); \ + p2_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3); \ + \ + tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_0, p1_in); \ + tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_1, q1_in); \ + p1_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3); \ + \ + tmp_filt8_1 = __lsx_vadd_h(q2_in, q1_in); \ + tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_1, q0_in); \ + tmp_filt8_2 = __lsx_vadd_h(tmp_filt8_2, tmp_filt8_1); \ + tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_2, p0_in); \ + tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_0, p3_in); \ + p0_filt8_out = __lsx_vsrari_h(tmp_filt8_0, 3); \ + \ + tmp_filt8_0 = __lsx_vadd_h(q2_in, q3_in); \ + tmp_filt8_0 = __lsx_vadd_h(p0_in, tmp_filt8_0); \ + tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_0, tmp_filt8_1); \ + tmp_filt8_1 = __lsx_vadd_h(q3_in, q3_in); \ + tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_1, tmp_filt8_0); \ + q2_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3); \ + \ + tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_2, q3_in); \ + tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_0, q0_in); \ + q0_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3); \ + \ + tmp_filt8_1 = __lsx_vsub_h(tmp_filt8_0, p2_in); \ + tmp_filt8_0 = __lsx_vadd_h(q1_in, q3_in); \ + tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_0, tmp_filt8_1); \ + q1_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3); \ + } while (0) + +#endif // VPX_VPX_DSP_LOONGARCH_LOOPFILTER_LSX_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/quantize_intrin_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/quantize_intrin_lsx.c new file mode 100644 index 0000000000..77be0bb4fe --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/quantize_intrin_lsx.c @@ -0,0 +1,248 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_util/loongson_intrinsics.h" + +static INLINE __m128i calculate_qcoeff(__m128i coeff, __m128i coeff_abs, + __m128i round, __m128i quant, + __m128i shift, __m128i cmp_mask) { + __m128i rounded, qcoeff; + + rounded = __lsx_vsadd_h(coeff_abs, round); + qcoeff = __lsx_vmuh_h(rounded, quant); + qcoeff = __lsx_vadd_h(rounded, qcoeff); + qcoeff = __lsx_vmuh_h(qcoeff, shift); + qcoeff = __lsx_vsigncov_h(coeff, qcoeff); + qcoeff = __lsx_vand_v(qcoeff, cmp_mask); + + return qcoeff; +} + +static INLINE void calculate_dqcoeff_and_store(__m128i qcoeff, __m128i dequant, + int16_t *dqcoeff) { + __m128i dqcoeff16 = __lsx_vmul_h(qcoeff, dequant); + __lsx_vst(dqcoeff16, dqcoeff, 0); +} + +static INLINE void calculate_dqcoeff_and_store_32x32(__m128i qcoeff, + __m128i dequant, + int16_t *dqcoeff) { + // Un-sign to bias rounding like C. + __m128i low, high, dqcoeff32_0, dqcoeff32_1, res; + __m128i zero = __lsx_vldi(0); + __m128i coeff = __lsx_vabsd_h(qcoeff, zero); + + const __m128i sign_0 = __lsx_vilvl_h(qcoeff, zero); + const __m128i sign_1 = __lsx_vilvh_h(qcoeff, zero); + + low = __lsx_vmul_h(coeff, dequant); + high = __lsx_vmuh_h(coeff, dequant); + dqcoeff32_0 = __lsx_vilvl_h(high, low); + dqcoeff32_1 = __lsx_vilvh_h(high, low); + + // "Divide" by 2. + dqcoeff32_0 = __lsx_vsrai_w(dqcoeff32_0, 1); + dqcoeff32_1 = __lsx_vsrai_w(dqcoeff32_1, 1); + dqcoeff32_0 = __lsx_vsigncov_w(sign_0, dqcoeff32_0); + dqcoeff32_1 = __lsx_vsigncov_w(sign_1, dqcoeff32_1); + res = __lsx_vpickev_h(dqcoeff32_1, dqcoeff32_0); + __lsx_vst(res, dqcoeff, 0); +} + +static INLINE __m128i scan_for_eob(__m128i coeff0, __m128i coeff1, + const int16_t *scan, int index, + __m128i zero) { + const __m128i zero_coeff0 = __lsx_vseq_h(coeff0, zero); + const __m128i zero_coeff1 = __lsx_vseq_h(coeff1, zero); + __m128i scan0 = __lsx_vld(scan + index, 0); + __m128i scan1 = __lsx_vld(scan + index + 8, 0); + __m128i eob0, eob1; + + eob0 = __lsx_vandn_v(zero_coeff0, scan0); + eob1 = __lsx_vandn_v(zero_coeff1, scan1); + return __lsx_vmax_h(eob0, eob1); +} + +static INLINE int16_t accumulate_eob(__m128i eob) { + __m128i eob_shuffled; + int16_t res_m; + + eob_shuffled = __lsx_vshuf4i_w(eob, 0xe); + eob = __lsx_vmax_h(eob, eob_shuffled); + eob_shuffled = __lsx_vshuf4i_h(eob, 0xe); + eob = __lsx_vmax_h(eob, eob_shuffled); + eob_shuffled = __lsx_vshuf4i_h(eob, 0x1); + eob = __lsx_vmax_h(eob, eob_shuffled); + res_m = __lsx_vpickve2gr_h(eob, 1); + + return res_m; +} + +#if !CONFIG_VP9_HIGHBITDEPTH +void vpx_quantize_b_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, + int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { + __m128i zero = __lsx_vldi(0); + int index = 16; + + __m128i zbin, round, quant, dequant, quant_shift; + __m128i coeff0, coeff1; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i eob, eob0; + + (void)scan; + + zbin = __lsx_vld(zbin_ptr, 0); + round = __lsx_vld(round_ptr, 0); + quant = __lsx_vld(quant_ptr, 0); + dequant = __lsx_vld(dequant_ptr, 0); + quant_shift = __lsx_vld(quant_shift_ptr, 0); + // Handle one DC and first 15 AC. 
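+
+  // Editorial note: a scalar sketch of the per-coefficient math the vector
+  // code below implements; it mirrors the C reference vpx_quantize_b_c(),
+  // and the names here are illustrative only (clamp16() saturates to the
+  // int16_t range):
+  //
+  //   abs = abs(coeff);
+  //   if (abs >= zbin) {
+  //     tmp = clamp16(abs + round);
+  //     tmp = ((((tmp * quant) >> 16) + tmp) * quant_shift) >> 16;
+  //     qcoeff = copysign(tmp, coeff);
+  //     dqcoeff = qcoeff * dequant;
+  //   } else {
+  //     qcoeff = dqcoeff = 0;
+  //   }
+  //
+  // __lsx_vmuh_h supplies the (x * y) >> 16 high-half multiplies,
+  // __lsx_vsigncov_h the sign copy, and the cmp_mask vectors apply the
+  // zbin dead zone.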
+ DUP2_ARG2(__lsx_vld, coeff_ptr, 0, coeff_ptr, 16, coeff0, coeff1); + qcoeff0 = __lsx_vabsd_h(coeff0, zero); + qcoeff1 = __lsx_vabsd_h(coeff1, zero); + + cmp_mask0 = __lsx_vsle_h(zbin, qcoeff0); + zbin = __lsx_vilvh_d(zbin, zbin); + cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1); + + qcoeff0 = + calculate_qcoeff(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0); + round = __lsx_vilvh_d(round, round); + quant = __lsx_vilvh_d(quant, quant); + quant_shift = __lsx_vilvh_d(quant_shift, quant_shift); + qcoeff1 = + calculate_qcoeff(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1); + + __lsx_vst(qcoeff0, qcoeff_ptr, 0); + __lsx_vst(qcoeff1, qcoeff_ptr, 16); + + calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr); + dequant = __lsx_vilvh_d(dequant, dequant); + calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8); + + eob = scan_for_eob(qcoeff0, qcoeff1, iscan, 0, zero); + // AC only loop. + while (index < n_coeffs) { + coeff0 = __lsx_vld(coeff_ptr + index, 0); + coeff1 = __lsx_vld(coeff_ptr + index + 8, 0); + + qcoeff0 = __lsx_vabsd_h(coeff0, zero); + qcoeff1 = __lsx_vabsd_h(coeff1, zero); + + cmp_mask0 = __lsx_vsle_h(zbin, qcoeff0); + cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1); + + qcoeff0 = + calculate_qcoeff(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0); + qcoeff1 = + calculate_qcoeff(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1); + + __lsx_vst(qcoeff0, qcoeff_ptr + index, 0); + __lsx_vst(qcoeff1, qcoeff_ptr + index + 8, 0); + + calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index); + calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8); + + eob0 = scan_for_eob(qcoeff0, qcoeff1, iscan, index, zero); + eob = __lsx_vmax_h(eob, eob0); + + index += 16; + } + + *eob_ptr = accumulate_eob(eob); +} + +void vpx_quantize_b_32x32_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + __m128i zero = __lsx_vldi(0); + int index; + + __m128i zbin, round, quant, dequant, quant_shift; + __m128i coeff0, coeff1, qcoeff0, qcoeff1, cmp_mask0, cmp_mask1; + __m128i eob = zero, eob0; + + (void)scan; + (void)n_coeffs; + + zbin = __lsx_vld(zbin_ptr, 0); + zbin = __lsx_vsrari_h(zbin, 1); + round = __lsx_vld(round_ptr, 0); + round = __lsx_vsrari_h(round, 1); + + quant = __lsx_vld(quant_ptr, 0); + dequant = __lsx_vld(dequant_ptr, 0); + quant_shift = __lsx_vld(quant_shift_ptr, 0); + quant_shift = __lsx_vslli_h(quant_shift, 1); + // Handle one DC and first 15 AC. 
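+
+  // Editorial note (illustrative, not part of the original source): the
+  // 32x32 variant differs from the plain vpx_quantize_b path in three ways,
+  // mirroring the C reference vpx_quantize_b_32x32_c:
+  //   - zbin and round were halved with rounding above (__lsx_vsrari_h),
+  //   - quant_shift was doubled (__lsx_vslli_h), so the final high-half
+  //     multiply acts like a shift by 15 instead of 16,
+  //   - calculate_dqcoeff_and_store_32x32() halves the dequantized value
+  //     while preserving its sign.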
+  DUP2_ARG2(__lsx_vld, coeff_ptr, 0, coeff_ptr, 16, coeff0, coeff1);
+  qcoeff0 = __lsx_vabsd_h(coeff0, zero);
+  qcoeff1 = __lsx_vabsd_h(coeff1, zero);
+
+  cmp_mask0 = __lsx_vsle_h(zbin, qcoeff0);
+  // remove DC from zbin
+  zbin = __lsx_vilvh_d(zbin, zbin);
+  cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1);
+
+  qcoeff0 =
+      calculate_qcoeff(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0);
+  // remove DC from round, quant and quant_shift
+  round = __lsx_vilvh_d(round, round);
+  quant = __lsx_vilvh_d(quant, quant);
+  quant_shift = __lsx_vilvh_d(quant_shift, quant_shift);
+  qcoeff1 =
+      calculate_qcoeff(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1);
+  __lsx_vst(qcoeff0, qcoeff_ptr, 0);
+  __lsx_vst(qcoeff1, qcoeff_ptr, 16);
+
+  calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, dqcoeff_ptr);
+  dequant = __lsx_vilvh_d(dequant, dequant);
+  calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, dqcoeff_ptr + 8);
+  eob = scan_for_eob(qcoeff0, qcoeff1, iscan, 0, zero);
+  // AC only loop.
+  for (index = 16; index < 32 * 32; index += 16) {
+    coeff0 = __lsx_vld(coeff_ptr + index, 0);
+    coeff1 = __lsx_vld(coeff_ptr + index + 8, 0);
+
+    qcoeff0 = __lsx_vabsd_h(coeff0, zero);
+    qcoeff1 = __lsx_vabsd_h(coeff1, zero);
+
+    cmp_mask0 = __lsx_vsle_h(zbin, qcoeff0);
+    cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1);
+
+    qcoeff0 =
+        calculate_qcoeff(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0);
+    qcoeff1 =
+        calculate_qcoeff(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1);
+    __lsx_vst(qcoeff0, qcoeff_ptr + index, 0);
+    __lsx_vst(qcoeff1, qcoeff_ptr + index + 8, 0);
+
+    calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, dqcoeff_ptr + index);
+    calculate_dqcoeff_and_store_32x32(qcoeff1, dequant,
+                                      dqcoeff_ptr + 8 + index);
+    eob0 = scan_for_eob(qcoeff0, qcoeff1, iscan, index, zero);
+    eob = __lsx_vmax_h(eob, eob0);
+  }
+
+  *eob_ptr = accumulate_eob(eob);
+}
+#endif
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/sad_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/sad_lsx.c
new file mode 100644
index 0000000000..b6fbedb0d0
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/sad_lsx.c
@@ -0,0 +1,717 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */ + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_util/loongson_intrinsics.h" + +static INLINE __m128i sad_ub2_uh(__m128i in0, __m128i in1, __m128i ref0, + __m128i ref1) { + __m128i diff0_m, diff1_m, sad_m0; + __m128i sad_m = __lsx_vldi(0); + + diff0_m = __lsx_vabsd_bu(in0, ref0); + diff1_m = __lsx_vabsd_bu(in1, ref1); + + sad_m0 = __lsx_vhaddw_hu_bu(diff0_m, diff0_m); + sad_m = __lsx_vadd_h(sad_m, sad_m0); + sad_m0 = __lsx_vhaddw_hu_bu(diff1_m, diff1_m); + sad_m = __lsx_vadd_h(sad_m, sad_m0); + + return sad_m; +} + +static INLINE uint32_t hadd_uw_u32(__m128i in) { + __m128i res0_m; + uint32_t sum_m; + + res0_m = __lsx_vhaddw_du_wu(in, in); + res0_m = __lsx_vhaddw_qu_du(res0_m, res0_m); + sum_m = __lsx_vpickve2gr_w(res0_m, 0); + + return sum_m; +} + +static INLINE uint32_t hadd_uh_u32(__m128i in) { + __m128i res_m; + uint32_t sum_m; + + res_m = __lsx_vhaddw_wu_hu(in, in); + sum_m = hadd_uw_u32(res_m); + + return sum_m; +} + +static INLINE int32_t hadd_sw_s32(__m128i in) { + __m128i res0_m; + int32_t sum_m; + + res0_m = __lsx_vhaddw_d_w(in, in); + res0_m = __lsx_vhaddw_q_d(res0_m, res0_m); + sum_m = __lsx_vpickve2gr_w(res0_m, 0); + + return sum_m; +} + +static uint32_t sad_8width_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + uint32_t res; + __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3, sad_tmp; + __m128i sad = __lsx_vldi(0); + + for (ht_cnt = (height >> 2); ht_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src0, ref0); + src += src_stride; + ref += ref_stride; + DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src1, ref1); + src += src_stride; + ref += ref_stride; + DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src2, ref2); + src += src_stride; + ref += ref_stride; + DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src3, ref3); + src += src_stride; + ref += ref_stride; + DUP4_ARG2(__lsx_vpickev_d, src1, src0, src3, src2, ref1, ref0, ref3, ref2, + src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad = __lsx_vadd_h(sad, sad_tmp); + } + res = hadd_uh_u32(sad); + return res; +} + +static uint32_t sad_16width_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt = (height >> 2); + uint32_t res; + __m128i src0, src1, ref0, ref1, sad_tmp; + __m128i sad = __lsx_vldi(0); + int32_t src_stride2 = src_stride << 1; + int32_t ref_stride2 = ref_stride << 1; + + for (; ht_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src0, ref0); + DUP2_ARG2(__lsx_vldx, src, src_stride, ref, ref_stride, src1, ref1); + src += src_stride2; + ref += ref_stride2; + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad = __lsx_vadd_h(sad, sad_tmp); + + DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src0, ref0); + DUP2_ARG2(__lsx_vldx, src, src_stride, ref, ref_stride, src1, ref1); + src += src_stride2; + ref += ref_stride2; + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad = __lsx_vadd_h(sad, sad_tmp); + } + + res = hadd_uh_u32(sad); + return res; +} + +static uint32_t sad_32width_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt = (height >> 2); + uint32_t res; + __m128i src0, src1, ref0, ref1; + __m128i sad_tmp; + __m128i sad = __lsx_vldi(0); + + for (; ht_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1); + src += src_stride; + DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1); + ref += ref_stride; + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad = 
__lsx_vadd_h(sad, sad_tmp); + + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1); + src += src_stride; + DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1); + ref += ref_stride; + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad = __lsx_vadd_h(sad, sad_tmp); + + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1); + src += src_stride; + DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1); + ref += ref_stride; + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad = __lsx_vadd_h(sad, sad_tmp); + + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1); + src += src_stride; + DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1); + ref += ref_stride; + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad = __lsx_vadd_h(sad, sad_tmp); + } + res = hadd_uh_u32(sad); + return res; +} + +static uint32_t sad_64width_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt = (height >> 1); + uint32_t sad = 0; + __m128i src0, src1, src2, src3; + __m128i ref0, ref1, ref2, ref3; + __m128i sad_tmp; + __m128i sad0 = __lsx_vldi(0); + __m128i sad1 = sad0; + + for (; ht_cnt--;) { + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, + src3); + src += src_stride; + DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2, + ref3); + ref += ref_stride; + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad0 = __lsx_vadd_h(sad0, sad_tmp); + sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3); + sad1 = __lsx_vadd_h(sad1, sad_tmp); + + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, + src3); + src += src_stride; + DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2, + ref3); + ref += ref_stride; + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad0 = __lsx_vadd_h(sad0, sad_tmp); + sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3); + sad1 = __lsx_vadd_h(sad1, sad_tmp); + } + + sad = hadd_uh_u32(sad0); + sad += hadd_uh_u32(sad1); + + return sad; +} + +static void sad_8width_x4d_lsx(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) { + int32_t ht_cnt = (height >> 2); + const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; + __m128i src0, src1, src2, src3, sad_tmp; + __m128i ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; + __m128i ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15; + __m128i sad0 = __lsx_vldi(0); + __m128i sad1 = sad0; + __m128i sad2 = sad0; + __m128i sad3 = sad0; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t ref_stride2 = ref_stride << 1; + int32_t ref_stride3 = ref_stride2 + ref_stride; + int32_t ref_stride4 = ref_stride2 << 1; + + ref0_ptr = aref_ptr[0]; + ref1_ptr = aref_ptr[1]; + ref2_ptr = aref_ptr[2]; + ref3_ptr = aref_ptr[3]; + + for (; ht_cnt--;) { + src0 = __lsx_vld(src_ptr, 0); + DUP2_ARG2(__lsx_vldx, src_ptr, src_stride, src_ptr, src_stride2, src1, + src2); + src3 = __lsx_vldx(src_ptr, src_stride3); + src_ptr += src_stride4; + ref0 = __lsx_vld(ref0_ptr, 0); + DUP2_ARG2(__lsx_vldx, ref0_ptr, ref_stride, ref0_ptr, ref_stride2, ref1, + ref2); + ref3 = __lsx_vldx(ref0_ptr, ref_stride3); + ref0_ptr += ref_stride4; + ref4 = __lsx_vld(ref1_ptr, 0); + DUP2_ARG2(__lsx_vldx, ref1_ptr, ref_stride, ref1_ptr, ref_stride2, ref5, + ref6); + ref7 = __lsx_vldx(ref1_ptr, ref_stride3); + ref1_ptr += ref_stride4; + ref8 = __lsx_vld(ref2_ptr, 0); + DUP2_ARG2(__lsx_vldx, ref2_ptr, ref_stride, 
ref2_ptr, ref_stride2, ref9, + ref10); + ref11 = __lsx_vldx(ref2_ptr, ref_stride3); + ref2_ptr += ref_stride4; + ref12 = __lsx_vld(ref3_ptr, 0); + DUP2_ARG2(__lsx_vldx, ref3_ptr, ref_stride, ref3_ptr, ref_stride2, ref13, + ref14); + ref15 = __lsx_vldx(ref3_ptr, ref_stride3); + ref3_ptr += ref_stride4; + + DUP2_ARG2(__lsx_vpickev_d, src1, src0, src3, src2, src0, src1); + DUP2_ARG2(__lsx_vpickev_d, ref1, ref0, ref3, ref2, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad0 = __lsx_vadd_h(sad0, sad_tmp); + + DUP2_ARG2(__lsx_vpickev_d, ref5, ref4, ref7, ref6, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad1 = __lsx_vadd_h(sad1, sad_tmp); + + DUP2_ARG2(__lsx_vpickev_d, ref9, ref8, ref11, ref10, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad2 = __lsx_vadd_h(sad2, sad_tmp); + + DUP2_ARG2(__lsx_vpickev_d, ref13, ref12, ref15, ref14, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad3 = __lsx_vadd_h(sad3, sad_tmp); + } + sad_array[0] = hadd_uh_u32(sad0); + sad_array[1] = hadd_uh_u32(sad1); + sad_array[2] = hadd_uh_u32(sad2); + sad_array[3] = hadd_uh_u32(sad3); +} + +static void sad_16width_x4d_lsx(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) { + int32_t ht_cnt = (height >> 1); + const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; + __m128i src, ref0, ref1, ref2, ref3, diff, sad_tmp; + __m128i sad0 = __lsx_vldi(0); + __m128i sad1 = sad0; + __m128i sad2 = sad0; + __m128i sad3 = sad0; + + ref0_ptr = aref_ptr[0]; + ref1_ptr = aref_ptr[1]; + ref2_ptr = aref_ptr[2]; + ref3_ptr = aref_ptr[3]; + + for (; ht_cnt--;) { + src = __lsx_vld(src_ptr, 0); + src_ptr += src_stride; + ref0 = __lsx_vld(ref0_ptr, 0); + ref0_ptr += ref_stride; + ref1 = __lsx_vld(ref1_ptr, 0); + ref1_ptr += ref_stride; + ref2 = __lsx_vld(ref2_ptr, 0); + ref2_ptr += ref_stride; + ref3 = __lsx_vld(ref3_ptr, 0); + ref3_ptr += ref_stride; + + diff = __lsx_vabsd_bu(src, ref0); + sad_tmp = __lsx_vhaddw_hu_bu(diff, diff); + sad0 = __lsx_vadd_h(sad0, sad_tmp); + diff = __lsx_vabsd_bu(src, ref1); + sad_tmp = __lsx_vhaddw_hu_bu(diff, diff); + sad1 = __lsx_vadd_h(sad1, sad_tmp); + diff = __lsx_vabsd_bu(src, ref2); + sad_tmp = __lsx_vhaddw_hu_bu(diff, diff); + sad2 = __lsx_vadd_h(sad2, sad_tmp); + diff = __lsx_vabsd_bu(src, ref3); + sad_tmp = __lsx_vhaddw_hu_bu(diff, diff); + sad3 = __lsx_vadd_h(sad3, sad_tmp); + + src = __lsx_vld(src_ptr, 0); + src_ptr += src_stride; + ref0 = __lsx_vld(ref0_ptr, 0); + ref0_ptr += ref_stride; + ref1 = __lsx_vld(ref1_ptr, 0); + ref1_ptr += ref_stride; + ref2 = __lsx_vld(ref2_ptr, 0); + ref2_ptr += ref_stride; + ref3 = __lsx_vld(ref3_ptr, 0); + ref3_ptr += ref_stride; + + diff = __lsx_vabsd_bu(src, ref0); + sad_tmp = __lsx_vhaddw_hu_bu(diff, diff); + sad0 = __lsx_vadd_h(sad0, sad_tmp); + diff = __lsx_vabsd_bu(src, ref1); + sad_tmp = __lsx_vhaddw_hu_bu(diff, diff); + sad1 = __lsx_vadd_h(sad1, sad_tmp); + diff = __lsx_vabsd_bu(src, ref2); + sad_tmp = __lsx_vhaddw_hu_bu(diff, diff); + sad2 = __lsx_vadd_h(sad2, sad_tmp); + diff = __lsx_vabsd_bu(src, ref3); + sad_tmp = __lsx_vhaddw_hu_bu(diff, diff); + sad3 = __lsx_vadd_h(sad3, sad_tmp); + } + sad_array[0] = hadd_uh_u32(sad0); + sad_array[1] = hadd_uh_u32(sad1); + sad_array[2] = hadd_uh_u32(sad2); + sad_array[3] = hadd_uh_u32(sad3); +} + +static void sad_32width_x4d_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) 
{ + const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; + int32_t ht_cnt = height; + __m128i src0, src1, ref0, ref1, sad_tmp; + __m128i sad0 = __lsx_vldi(0); + __m128i sad1 = sad0; + __m128i sad2 = sad0; + __m128i sad3 = sad0; + + ref0_ptr = aref_ptr[0]; + ref1_ptr = aref_ptr[1]; + ref2_ptr = aref_ptr[2]; + ref3_ptr = aref_ptr[3]; + + for (; ht_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1); + src += src_stride; + + DUP2_ARG2(__lsx_vld, ref0_ptr, 0, ref0_ptr, 16, ref0, ref1); + ref0_ptr += ref_stride; + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad0 = __lsx_vadd_h(sad0, sad_tmp); + + DUP2_ARG2(__lsx_vld, ref1_ptr, 0, ref1_ptr, 16, ref0, ref1); + ref1_ptr += ref_stride; + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad1 = __lsx_vadd_h(sad1, sad_tmp); + + DUP2_ARG2(__lsx_vld, ref2_ptr, 0, ref2_ptr, 16, ref0, ref1); + ref2_ptr += ref_stride; + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad2 = __lsx_vadd_h(sad2, sad_tmp); + + DUP2_ARG2(__lsx_vld, ref3_ptr, 0, ref3_ptr, 16, ref0, ref1); + ref3_ptr += ref_stride; + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad3 = __lsx_vadd_h(sad3, sad_tmp); + } + sad_array[0] = hadd_uh_u32(sad0); + sad_array[1] = hadd_uh_u32(sad1); + sad_array[2] = hadd_uh_u32(sad2); + sad_array[3] = hadd_uh_u32(sad3); +} + +static void sad_64width_x4d_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) { + const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; + int32_t ht_cnt = height; + __m128i src0, src1, src2, src3; + __m128i ref0, ref1, ref2, ref3; + __m128i sad, sad_tmp; + + __m128i sad0_0 = __lsx_vldi(0); + __m128i sad0_1 = sad0_0; + __m128i sad1_0 = sad0_0; + __m128i sad1_1 = sad0_0; + __m128i sad2_0 = sad0_0; + __m128i sad2_1 = sad0_0; + __m128i sad3_0 = sad0_0; + __m128i sad3_1 = sad0_0; + + ref0_ptr = aref_ptr[0]; + ref1_ptr = aref_ptr[1]; + ref2_ptr = aref_ptr[2]; + ref3_ptr = aref_ptr[3]; + + for (; ht_cnt--;) { + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, + src3); + src += src_stride; + + DUP4_ARG2(__lsx_vld, ref0_ptr, 0, ref0_ptr, 16, ref0_ptr, 32, ref0_ptr, 48, + ref0, ref1, ref2, ref3); + ref0_ptr += ref_stride; + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad0_0 = __lsx_vadd_h(sad0_0, sad_tmp); + sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3); + sad0_1 = __lsx_vadd_h(sad0_1, sad_tmp); + + DUP4_ARG2(__lsx_vld, ref1_ptr, 0, ref1_ptr, 16, ref1_ptr, 32, ref1_ptr, 48, + ref0, ref1, ref2, ref3); + ref1_ptr += ref_stride; + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad1_0 = __lsx_vadd_h(sad1_0, sad_tmp); + sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3); + sad1_1 = __lsx_vadd_h(sad1_1, sad_tmp); + + DUP4_ARG2(__lsx_vld, ref2_ptr, 0, ref2_ptr, 16, ref2_ptr, 32, ref2_ptr, 48, + ref0, ref1, ref2, ref3); + ref2_ptr += ref_stride; + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad2_0 = __lsx_vadd_h(sad2_0, sad_tmp); + sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3); + sad2_1 = __lsx_vadd_h(sad2_1, sad_tmp); + + DUP4_ARG2(__lsx_vld, ref3_ptr, 0, ref3_ptr, 16, ref3_ptr, 32, ref3_ptr, 48, + ref0, ref1, ref2, ref3); + ref3_ptr += ref_stride; + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad3_0 = __lsx_vadd_h(sad3_0, sad_tmp); + sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3); + sad3_1 = __lsx_vadd_h(sad3_1, sad_tmp); + } + sad = __lsx_vhaddw_wu_hu(sad0_0, sad0_0); + sad_tmp = __lsx_vhaddw_wu_hu(sad0_1, sad0_1); + sad = __lsx_vadd_w(sad, sad_tmp); + sad_array[0] = hadd_uw_u32(sad); + + sad = 
__lsx_vhaddw_wu_hu(sad1_0, sad1_0); + sad_tmp = __lsx_vhaddw_wu_hu(sad1_1, sad1_1); + sad = __lsx_vadd_w(sad, sad_tmp); + sad_array[1] = hadd_uw_u32(sad); + + sad = __lsx_vhaddw_wu_hu(sad2_0, sad2_0); + sad_tmp = __lsx_vhaddw_wu_hu(sad2_1, sad2_1); + sad = __lsx_vadd_w(sad, sad_tmp); + sad_array[2] = hadd_uw_u32(sad); + + sad = __lsx_vhaddw_wu_hu(sad3_0, sad3_0); + sad_tmp = __lsx_vhaddw_wu_hu(sad3_1, sad3_1); + sad = __lsx_vadd_w(sad, sad_tmp); + sad_array[3] = hadd_uw_u32(sad); +} + +static uint32_t avgsad_32width_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height, const uint8_t *sec_pred) { + int32_t res, ht_cnt = (height >> 2); + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; + __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; + __m128i comp0, comp1, sad_tmp; + __m128i sad = __lsx_vldi(0); + uint8_t *src_tmp, *ref_tmp; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t ref_stride2 = ref_stride << 1; + int32_t ref_stride3 = ref_stride2 + ref_stride; + int32_t ref_stride4 = ref_stride2 << 1; + + for (; ht_cnt--;) { + src_tmp = (uint8_t *)src + 16; + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4); + src6 = __lsx_vldx(src, src_stride3); + src1 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src3, + src5); + src7 = __lsx_vldx(src_tmp, src_stride3); + src += src_stride4; + + ref_tmp = (uint8_t *)ref + 16; + ref0 = __lsx_vld(ref, 0); + DUP2_ARG2(__lsx_vldx, ref, ref_stride, ref, ref_stride2, ref2, ref4); + ref6 = __lsx_vldx(ref, ref_stride3); + ref1 = __lsx_vld(ref_tmp, 0); + DUP2_ARG2(__lsx_vldx, ref_tmp, ref_stride, ref_tmp, ref_stride2, ref3, + ref5); + ref7 = __lsx_vldx(ref_tmp, ref_stride3); + ref += ref_stride4; + + DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 32, sec_pred, 64, sec_pred, 96, + pred0, pred2, pred4, pred6); + DUP4_ARG2(__lsx_vld, sec_pred, 16, sec_pred, 48, sec_pred, 80, sec_pred, + 112, pred1, pred3, pred5, pred7); + sec_pred += 128; + + DUP2_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, comp0, comp1); + sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1); + sad = __lsx_vadd_h(sad, sad_tmp); + DUP2_ARG2(__lsx_vavgr_bu, pred2, ref2, pred3, ref3, comp0, comp1); + sad_tmp = sad_ub2_uh(src2, src3, comp0, comp1); + sad = __lsx_vadd_h(sad, sad_tmp); + DUP2_ARG2(__lsx_vavgr_bu, pred4, ref4, pred5, ref5, comp0, comp1); + sad_tmp = sad_ub2_uh(src4, src5, comp0, comp1); + sad = __lsx_vadd_h(sad, sad_tmp); + DUP2_ARG2(__lsx_vavgr_bu, pred6, ref6, pred7, ref7, comp0, comp1); + sad_tmp = sad_ub2_uh(src6, src7, comp0, comp1); + sad = __lsx_vadd_h(sad, sad_tmp); + } + res = hadd_uh_u32(sad); + return res; +} + +static uint32_t avgsad_64width_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height, const uint8_t *sec_pred) { + int32_t res, ht_cnt = (height >> 2); + __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3; + __m128i comp0, comp1, comp2, comp3, pred0, pred1, pred2, pred3; + __m128i sad, sad_tmp; + __m128i sad0 = __lsx_vldi(0); + __m128i sad1 = sad0; + + for (; ht_cnt--;) { + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, + src3); + src += src_stride; + DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2, + ref3); + ref += ref_stride; + DUP4_ARG2(__lsx_vld, sec_pred, 0, 
sec_pred, 16, sec_pred, 32, sec_pred, 48, + pred0, pred1, pred2, pred3); + sec_pred += 64; + DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3, + ref3, comp0, comp1, comp2, comp3); + sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1); + sad0 = __lsx_vadd_h(sad0, sad_tmp); + sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3); + sad1 = __lsx_vadd_h(sad1, sad_tmp); + + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, + src3); + src += src_stride; + DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2, + ref3); + ref += ref_stride; + DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48, + pred0, pred1, pred2, pred3); + sec_pred += 64; + DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3, + ref3, comp0, comp1, comp2, comp3); + sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1); + sad0 = __lsx_vadd_h(sad0, sad_tmp); + sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3); + sad1 = __lsx_vadd_h(sad1, sad_tmp); + + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, + src3); + src += src_stride; + DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2, + ref3); + ref += ref_stride; + DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48, + pred0, pred1, pred2, pred3); + sec_pred += 64; + DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3, + ref3, comp0, comp1, comp2, comp3); + sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1); + sad0 = __lsx_vadd_h(sad0, sad_tmp); + sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3); + sad1 = __lsx_vadd_h(sad1, sad_tmp); + + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, + src3); + src += src_stride; + DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2, + ref3); + ref += ref_stride; + DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48, + pred0, pred1, pred2, pred3); + sec_pred += 64; + DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3, + ref3, comp0, comp1, comp2, comp3); + sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1); + sad0 = __lsx_vadd_h(sad0, sad_tmp); + sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3); + sad1 = __lsx_vadd_h(sad1, sad_tmp); + } + sad = __lsx_vhaddw_wu_hu(sad0, sad0); + sad_tmp = __lsx_vhaddw_wu_hu(sad1, sad1); + sad = __lsx_vadd_w(sad, sad_tmp); + + res = hadd_sw_s32(sad); + return res; +} + +#define VPX_SAD_8xHT_LSX(height) \ + uint32_t vpx_sad8x##height##_lsx(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_8width_lsx(src, src_stride, ref, ref_stride, height); \ + } + +#define VPX_SAD_16xHT_LSX(height) \ + uint32_t vpx_sad16x##height##_lsx(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_16width_lsx(src, src_stride, ref, ref_stride, height); \ + } + +#define VPX_SAD_32xHT_LSX(height) \ + uint32_t vpx_sad32x##height##_lsx(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_32width_lsx(src, src_stride, ref, ref_stride, height); \ + } + +#define VPX_SAD_64xHT_LSX(height) \ + uint32_t vpx_sad64x##height##_lsx(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_64width_lsx(src, src_stride, ref, ref_stride, height); \ + } + +#define VPX_SAD_8xHTx4D_LSX(height) \ + void vpx_sad8x##height##x4d_lsx(const uint8_t *src, int32_t src_stride, \ + const uint8_t *const refs[4], \ + int32_t ref_stride, 
uint32_t sads[4]) { \ + sad_8width_x4d_lsx(src, src_stride, refs, ref_stride, height, sads); \ + } + +#define VPX_SAD_16xHTx4D_LSX(height) \ + void vpx_sad16x##height##x4d_lsx(const uint8_t *src, int32_t src_stride, \ + const uint8_t *const refs[], \ + int32_t ref_stride, uint32_t *sads) { \ + sad_16width_x4d_lsx(src, src_stride, refs, ref_stride, height, sads); \ + } + +#define VPX_SAD_32xHTx4D_LSX(height) \ + void vpx_sad32x##height##x4d_lsx(const uint8_t *src, int32_t src_stride, \ + const uint8_t *const refs[], \ + int32_t ref_stride, uint32_t *sads) { \ + sad_32width_x4d_lsx(src, src_stride, refs, ref_stride, height, sads); \ + } + +#define VPX_SAD_64xHTx4D_LSX(height) \ + void vpx_sad64x##height##x4d_lsx(const uint8_t *src, int32_t src_stride, \ + const uint8_t *const refs[], \ + int32_t ref_stride, uint32_t *sads) { \ + sad_64width_x4d_lsx(src, src_stride, refs, ref_stride, height, sads); \ + } + +#define VPX_AVGSAD_32xHT_LSX(height) \ + uint32_t vpx_sad32x##height##_avg_lsx( \ + const uint8_t *src, int32_t src_stride, const uint8_t *ref, \ + int32_t ref_stride, const uint8_t *second_pred) { \ + return avgsad_32width_lsx(src, src_stride, ref, ref_stride, height, \ + second_pred); \ + } + +#define VPX_AVGSAD_64xHT_LSX(height) \ + uint32_t vpx_sad64x##height##_avg_lsx( \ + const uint8_t *src, int32_t src_stride, const uint8_t *ref, \ + int32_t ref_stride, const uint8_t *second_pred) { \ + return avgsad_64width_lsx(src, src_stride, ref, ref_stride, height, \ + second_pred); \ + } + +#define SAD64 \ + VPX_SAD_64xHT_LSX(64) VPX_SAD_64xHTx4D_LSX(64) VPX_SAD_64xHTx4D_LSX(32) \ + VPX_AVGSAD_64xHT_LSX(64) + +SAD64 + +#define SAD32 \ + VPX_SAD_32xHT_LSX(32) VPX_SAD_32xHTx4D_LSX(32) VPX_SAD_32xHTx4D_LSX(64) \ + VPX_AVGSAD_32xHT_LSX(32) + +SAD32 + +#define SAD16 VPX_SAD_16xHT_LSX(16) VPX_SAD_16xHTx4D_LSX(16) + +SAD16 + +#define SAD8 VPX_SAD_8xHT_LSX(8) VPX_SAD_8xHTx4D_LSX(8) + +SAD8 + +#undef SAD64 +#undef SAD32 +#undef SAD16 +#undef SAD8 diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/sub_pixel_variance_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/sub_pixel_variance_lsx.c new file mode 100644 index 0000000000..700793531c --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/sub_pixel_variance_lsx.c @@ -0,0 +1,874 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_ports/mem.h" +#include "vpx_dsp/loongarch/variance_lsx.h" +#include "vpx_dsp/variance.h" + +static const uint8_t bilinear_filters_lsx[8][2] = { + { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, + { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, +}; + +#define VARIANCE_WxH(sse, diff, shift) \ + (sse) - (((uint32_t)(diff) * (diff)) >> (shift)) + +#define VARIANCE_LARGE_WxH(sse, diff, shift) \ + (sse) - (((int64_t)(diff) * (diff)) >> (shift)) + +static uint32_t avg_sse_diff_64x64_lsx(const uint8_t *src_ptr, + int32_t src_stride, + const uint8_t *ref_ptr, + int32_t ref_stride, + const uint8_t *sec_pred, int32_t *diff) { + int32_t res, ht_cnt = 32; + __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3; + __m128i pred0, pred1, pred2, pred3, vec, vec_tmp; + __m128i avg0, avg1, avg2, avg3; + __m128i var = __lsx_vldi(0); + + avg0 = var; + avg1 = var; + avg2 = var; + avg3 = var; + + for (; ht_cnt--;) { + DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48, + pred0, pred1, pred2, pred3); + sec_pred += 64; + DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48, + src0, src1, src2, src3); + src_ptr += src_stride; + DUP4_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref_ptr, 32, ref_ptr, 48, + ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + + DUP4_ARG2(__lsx_vavgr_bu, src0, pred0, src1, pred1, src2, pred2, src3, + pred3, src0, src1, src2, src3); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src2, ref2, var, avg2); + CALC_MSE_AVG_B(src3, ref3, var, avg3); + + DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48, + pred0, pred1, pred2, pred3); + sec_pred += 64; + DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48, + src0, src1, src2, src3); + src_ptr += src_stride; + DUP4_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref_ptr, 32, ref_ptr, 48, + ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + + DUP4_ARG2(__lsx_vavgr_bu, src0, pred0, src1, pred1, src2, pred2, src3, + pred3, src0, src1, src2, src3); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src2, ref2, var, avg2); + CALC_MSE_AVG_B(src3, ref3, var, avg3); + } + vec = __lsx_vhaddw_w_h(avg0, avg0); + vec_tmp = __lsx_vhaddw_w_h(avg1, avg1); + vec = __lsx_vadd_w(vec, vec_tmp); + vec_tmp = __lsx_vhaddw_w_h(avg2, avg2); + vec = __lsx_vadd_w(vec, vec_tmp); + vec_tmp = __lsx_vhaddw_w_h(avg3, avg3); + vec = __lsx_vadd_w(vec, vec_tmp); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + + return res; +} + +static uint32_t sub_pixel_sse_diff_8width_h_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + uint32_t loop_cnt = (height >> 2); + int32_t res; + __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3; + __m128i vec0, vec1, vec2, vec3, filt0, out, vec; + __m128i mask = { 0x0403030202010100, 0x0807070606050504 }; + __m128i avg = __lsx_vldi(0); + __m128i var = avg; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + + filt0 = __lsx_vldrepl_h(filter, 0); + for (; loop_cnt--;) { + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src 
+= src_stride4; + ref0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, ref1, ref2); + ref3 = __lsx_vldx(dst, dst_stride3); + dst += dst_stride4; + + DUP2_ARG2(__lsx_vpickev_d, ref1, ref0, ref3, ref2, ref0, ref1); + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, vec0, vec1, vec2, vec3); + DUP4_ARG3(__lsx_vssrarni_bu_h, vec0, vec0, FILTER_BITS, vec1, vec1, + FILTER_BITS, vec2, vec2, FILTER_BITS, vec3, vec3, FILTER_BITS, + src0, src1, src2, src3); + out = __lsx_vpackev_d(src1, src0); + CALC_MSE_AVG_B(out, ref0, var, avg); + out = __lsx_vpackev_d(src3, src2); + CALC_MSE_AVG_B(out, ref1, var, avg); + } + vec = __lsx_vhaddw_w_h(avg, avg); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + return res; +} + +static uint32_t sub_pixel_sse_diff_16width_h_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + uint32_t loop_cnt = (height >> 2); + int32_t res; + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i dst0, dst1, dst2, dst3, filt0; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i out0, out1, out2, out3, out4, out5, out6, out7; + __m128i vec, var = __lsx_vldi(0); + __m128i avg = var; + __m128i mask = { 0x0403030202010100, 0x0807070606050504 }; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + + filt0 = __lsx_vldrepl_h(filter, 0); + + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src4, src5); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src6, src7); + src += src_stride; + + dst0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2); + dst3 = __lsx_vldx(dst, dst_stride3); + dst += dst_stride4; + + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5); + DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7); + + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, out0, out1, out2, out3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, + filt0, out4, out5, out6, out7); + DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2, + FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS, + src0, src1, src2, src3); + CALC_MSE_AVG_B(src0, dst0, var, avg); + CALC_MSE_AVG_B(src1, dst1, var, avg); + CALC_MSE_AVG_B(src2, dst2, var, avg); + CALC_MSE_AVG_B(src3, dst3, var, avg); + } + vec = __lsx_vhaddw_w_h(avg, avg); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + return res; +} + +static uint32_t sub_pixel_sse_diff_32width_h_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + uint32_t sse = 0; + int32_t diff0[2]; + + sse += sub_pixel_sse_diff_16width_h_lsx(src, src_stride, dst, dst_stride, + filter, height, &diff0[0]); + src += 16; + dst += 16; + + sse += sub_pixel_sse_diff_16width_h_lsx(src, src_stride, dst, 
dst_stride, + filter, height, &diff0[1]); + + *diff = diff0[0] + diff0[1]; + + return sse; +} + +static uint32_t sub_pixel_sse_diff_8width_v_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + uint32_t loop_cnt = (height >> 2); + int32_t res; + __m128i ref0, ref1, ref2, ref3, src0, src1, src2, src3, src4; + __m128i vec, vec0, vec1, vec2, vec3, tmp0, tmp1, tmp2, tmp3, filt0; + __m128i avg = __lsx_vldi(0); + __m128i var = avg; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + + filt0 = __lsx_vldrepl_h(filter, 0); + src0 = __lsx_vld(src, 0); + src += src_stride; + + for (; loop_cnt--;) { + src1 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3); + src4 = __lsx_vldx(src, src_stride3); + src += src_stride4; + ref0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, ref1, ref2); + ref3 = __lsx_vldx(dst, dst_stride3); + dst += dst_stride4; + + DUP2_ARG2(__lsx_vpickev_d, ref1, ref0, ref3, ref2, ref0, ref1); + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, + vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, tmp0, tmp1, tmp2, tmp3); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + src0 = src4; + } + vec = __lsx_vhaddw_w_h(avg, avg); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + return res; +} + +static uint32_t sub_pixel_sse_diff_16width_v_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + uint32_t loop_cnt = (height >> 2); + int32_t res; + __m128i ref0, ref1, ref2, ref3, src0, src1, src2, src3, src4; + __m128i out0, out1, out2, out3, tmp0, tmp1, filt0, vec; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i var = __lsx_vldi(0); + __m128i avg = var; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + src += src_stride; + + for (; loop_cnt--;) { + src1 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3); + src4 = __lsx_vldx(src, src_stride3); + src += src_stride4; + ref0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, ref1, ref2); + ref3 = __lsx_vldx(dst, dst_stride3); + dst += dst_stride4; + + DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2); + DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); + out0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6); + DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7); + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + out1 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, 
tmp0, tmp1); + out2 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + out3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + src0 = src4; + + CALC_MSE_AVG_B(out0, ref0, var, avg); + CALC_MSE_AVG_B(out1, ref1, var, avg); + CALC_MSE_AVG_B(out2, ref2, var, avg); + CALC_MSE_AVG_B(out3, ref3, var, avg); + } + vec = __lsx_vhaddw_w_h(avg, avg); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + return res; +} + +static uint32_t sub_pixel_sse_diff_32width_v_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + uint32_t sse = 0; + int32_t diff0[2]; + + sse += sub_pixel_sse_diff_16width_v_lsx(src, src_stride, dst, dst_stride, + filter, height, &diff0[0]); + src += 16; + dst += 16; + + sse += sub_pixel_sse_diff_16width_v_lsx(src, src_stride, dst, dst_stride, + filter, height, &diff0[1]); + + *diff = diff0[0] + diff0[1]; + + return sse; +} + +static uint32_t sub_pixel_sse_diff_8width_hv_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, + int32_t height, int32_t *diff) { + uint32_t loop_cnt = (height >> 2); + int32_t res; + __m128i ref0, ref1, ref2, ref3, src0, src1, src2, src3, src4, out0, out1; + __m128i hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3, vec, vec0, filt_hz, filt_vt; + __m128i mask = { 0x0403030202010100, 0x0807070606050504 }; + __m128i avg = __lsx_vldi(0); + __m128i var = avg; + + filt_hz = __lsx_vldrepl_h(filter_horiz, 0); + filt_vt = __lsx_vldrepl_h(filter_vert, 0); + + src0 = __lsx_vld(src, 0); + src += src_stride; + HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS, hz_out0); + + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, dst, 0, src1, ref0); + src += src_stride; + dst += dst_stride; + DUP2_ARG2(__lsx_vld, src, 0, dst, 0, src2, ref1); + src += src_stride; + dst += dst_stride; + DUP2_ARG2(__lsx_vld, src, 0, dst, 0, src3, ref2); + src += src_stride; + dst += dst_stride; + DUP2_ARG2(__lsx_vld, src, 0, dst, 0, src4, ref3); + src += src_stride; + dst += dst_stride; + + DUP2_ARG2(__lsx_vpickev_d, ref1, ref0, ref3, ref2, ref0, ref1); + HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS, hz_out1); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp0 = __lsx_vdp2_h_bu(vec0, filt_vt); + HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS, hz_out0); + vec0 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt); + + HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS, hz_out1); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt); + HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS, hz_out0); + vec0 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, out0, out1); + CALC_MSE_AVG_B(out0, ref0, var, avg); + CALC_MSE_AVG_B(out1, ref1, var, avg); + } + vec = __lsx_vhaddw_w_h(avg, avg); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + return res; +} + +static uint32_t sub_pixel_sse_diff_16width_hv_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, + int32_t height, int32_t *diff) { + uint32_t loop_cnt = (height >> 2); + int32_t res; + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i ref0, ref1, ref2, ref3, filt_hz, filt_vt, vec0, 
vec1; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1, vec; + __m128i var = __lsx_vldi(0); + __m128i avg = var; + __m128i mask = { 0x0403030202010100, 0x0807070606050504 }; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + + filt_hz = __lsx_vldrepl_h(filter_horiz, 0); + filt_vt = __lsx_vldrepl_h(filter_vert, 0); + + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1); + src += src_stride; + + HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS, hz_out0); + HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS, hz_out2); + + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src4, src5); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src6, src7); + src += src_stride; + + ref0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, ref1, ref2); + ref3 = __lsx_vldx(dst, dst_stride3); + dst += dst_stride4; + + HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS, hz_out1); + HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS, hz_out3); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + src0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS, hz_out0); + HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS, hz_out2); + DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + src1 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS, hz_out1); + HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS, hz_out3); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + src2 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS, hz_out0); + HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS, hz_out2); + DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + src3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + CALC_MSE_AVG_B(src2, ref2, var, avg); + CALC_MSE_AVG_B(src3, ref3, var, avg); + } + vec = __lsx_vhaddw_w_h(avg, avg); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + + return res; +} + +static uint32_t sub_pixel_sse_diff_32width_hv_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, + int32_t height, int32_t *diff) { + uint32_t sse = 0; + int32_t diff0[2]; + + sse += sub_pixel_sse_diff_16width_hv_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height, + &diff0[0]); + src += 16; + dst += 16; + + sse += sub_pixel_sse_diff_16width_hv_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height, + &diff0[1]); + + *diff = diff0[0] + diff0[1]; + + return sse; +} + +static uint32_t subpel_avg_ssediff_16w_h_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const 
uint8_t *filter, + int32_t height, int32_t *diff, int32_t width) { + uint32_t loop_cnt = (height >> 2); + int32_t res; + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3; + __m128i pred0, pred1, pred2, pred3, filt0, vec; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i out0, out1, out2, out3, out4, out5, out6, out7; + __m128i mask = { 0x403030202010100, 0x807070606050504 }; + __m128i avg = __lsx_vldi(0); + __m128i var = avg; + + filt0 = __lsx_vldrepl_h(filter, 0); + + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src4, src5); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src6, src7); + src += src_stride; + + dst0 = __lsx_vld(dst, 0); + dst += dst_stride; + dst1 = __lsx_vld(dst, 0); + dst += dst_stride; + dst2 = __lsx_vld(dst, 0); + dst += dst_stride; + dst3 = __lsx_vld(dst, 0); + dst += dst_stride; + + pred0 = __lsx_vld(sec_pred, 0); + sec_pred += width; + pred1 = __lsx_vld(sec_pred, 0); + sec_pred += width; + pred2 = __lsx_vld(sec_pred, 0); + sec_pred += width; + pred3 = __lsx_vld(sec_pred, 0); + sec_pred += width; + + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5); + DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7); + + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, out0, out1, out2, out3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, + filt0, out4, out5, out6, out7); + DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2, + FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS, + tmp0, tmp1, tmp2, tmp3); + DUP4_ARG2(__lsx_vavgr_bu, tmp0, pred0, tmp1, pred1, tmp2, pred2, tmp3, + pred3, tmp0, tmp1, tmp2, tmp3); + + CALC_MSE_AVG_B(tmp0, dst0, var, avg); + CALC_MSE_AVG_B(tmp1, dst1, var, avg); + CALC_MSE_AVG_B(tmp2, dst2, var, avg); + CALC_MSE_AVG_B(tmp3, dst3, var, avg); + } + vec = __lsx_vhaddw_w_h(avg, avg); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + + return res; +} + +static uint32_t subpel_avg_ssediff_16w_v_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff, int32_t width) { + uint32_t loop_cnt = (height >> 2); + int32_t res; + __m128i ref0, ref1, ref2, ref3, pred0, pred1, pred2, pred3; + __m128i src0, src1, src2, src3, src4, out0, out1, out2, out3; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i tmp0, tmp1, vec, filt0; + __m128i avg = __lsx_vldi(0); + __m128i var = avg; + + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + src += src_stride; + + for (; loop_cnt--;) { + src1 = __lsx_vld(src, 0); + src += src_stride; + src2 = __lsx_vld(src, 0); + src += src_stride; + src3 = __lsx_vld(src, 0); + src += src_stride; + src4 = __lsx_vld(src, 0); + src += src_stride; + + pred0 = __lsx_vld(sec_pred, 0); + sec_pred += width; + pred1 = __lsx_vld(sec_pred, 0); + sec_pred += width; + pred2 = __lsx_vld(sec_pred, 0); + sec_pred += width; + pred3 = __lsx_vld(sec_pred, 0); + sec_pred += width; + + DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2); + DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, 
src1, vec1, vec3); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); + out0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6); + DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7); + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + out1 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + out2 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + out3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + src0 = src4; + ref0 = __lsx_vld(dst, 0); + dst += dst_stride; + ref1 = __lsx_vld(dst, 0); + dst += dst_stride; + ref2 = __lsx_vld(dst, 0); + dst += dst_stride; + ref3 = __lsx_vld(dst, 0); + dst += dst_stride; + + DUP4_ARG2(__lsx_vavgr_bu, out0, pred0, out1, pred1, out2, pred2, out3, + pred3, out0, out1, out2, out3); + + CALC_MSE_AVG_B(out0, ref0, var, avg); + CALC_MSE_AVG_B(out1, ref1, var, avg); + CALC_MSE_AVG_B(out2, ref2, var, avg); + CALC_MSE_AVG_B(out3, ref3, var, avg); + } + vec = __lsx_vhaddw_w_h(avg, avg); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + return res; +} + +static uint32_t subpel_avg_ssediff_16w_hv_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff, int32_t width) { + uint32_t loop_cnt = (height >> 2); + int32_t res; + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i ref0, ref1, ref2, ref3, pred0, pred1, pred2, pred3; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1; + __m128i out0, out1, out2, out3, filt_hz, filt_vt, vec, vec0, vec1; + __m128i mask = { 0x403030202010100, 0x807070606050504 }; + __m128i avg = __lsx_vldi(0); + __m128i var = avg; + + filt_hz = __lsx_vldrepl_h(filter_horiz, 0); + filt_vt = __lsx_vldrepl_h(filter_vert, 0); + + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1); + src += src_stride; + + HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS, hz_out0); + HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS, hz_out2); + + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src4, src5); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src6, src7); + src += src_stride; + + pred0 = __lsx_vld(sec_pred, 0); + sec_pred += width; + pred1 = __lsx_vld(sec_pred, 0); + sec_pred += width; + pred2 = __lsx_vld(sec_pred, 0); + sec_pred += width; + pred3 = __lsx_vld(sec_pred, 0); + sec_pred += width; + + HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS, hz_out1); + HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS, hz_out3); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + out0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS, hz_out0); + HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS, hz_out2); + DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + out1 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS, hz_out1); + 
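/* rows 2..3: same horizontal-then-vertical 2-tap filtering as rows 0..1 */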
HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS, hz_out3); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + out2 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS, hz_out0); + HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS, hz_out2); + DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + out3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + ref0 = __lsx_vld(dst, 0); + dst += dst_stride; + ref1 = __lsx_vld(dst, 0); + dst += dst_stride; + ref2 = __lsx_vld(dst, 0); + dst += dst_stride; + ref3 = __lsx_vld(dst, 0); + dst += dst_stride; + + DUP4_ARG2(__lsx_vavgr_bu, out0, pred0, out1, pred1, out2, pred2, out3, + pred3, out0, out1, out2, out3); + + CALC_MSE_AVG_B(out0, ref0, var, avg); + CALC_MSE_AVG_B(out1, ref1, var, avg); + CALC_MSE_AVG_B(out2, ref2, var, avg); + CALC_MSE_AVG_B(out3, ref3, var, avg); + } + vec = __lsx_vhaddw_w_h(avg, avg); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + return res; +} + +static uint32_t sub_pixel_avg_sse_diff_64width_h_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[4]; + + for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { + sse += + subpel_avg_ssediff_16w_h_lsx(src, src_stride, dst, dst_stride, sec_pred, + filter, height, &diff0[loop_cnt], 64); + src += 16; + dst += 16; + sec_pred += 16; + } + + *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; + + return sse; +} + +static uint32_t sub_pixel_avg_sse_diff_64width_v_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[4]; + + for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { + sse += + subpel_avg_ssediff_16w_v_lsx(src, src_stride, dst, dst_stride, sec_pred, + filter, height, &diff0[loop_cnt], 64); + src += 16; + dst += 16; + sec_pred += 16; + } + + *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; + + return sse; +} + +static uint32_t sub_pixel_avg_sse_diff_64width_hv_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[4]; + + for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { + sse += subpel_avg_ssediff_16w_hv_lsx(src, src_stride, dst, dst_stride, + sec_pred, filter_horiz, filter_vert, + height, &diff0[loop_cnt], 64); + src += 16; + dst += 16; + sec_pred += 16; + } + + *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; + + return sse; +} + +#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6) +#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8) +#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10) +#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12) + +#define VPX_SUB_PIXEL_VARIANCE_WDXHT_LSX(wd, ht) \ + uint32_t vpx_sub_pixel_variance##wd##x##ht##_lsx( \ + const uint8_t *src, int32_t src_stride, int32_t x_offset, \ + int32_t y_offset, const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sse) { \ + int32_t diff; \ + uint32_t var; \ + const uint8_t 
*h_filter = bilinear_filters_lsx[x_offset]; \ + const uint8_t *v_filter = bilinear_filters_lsx[y_offset]; \ + \ + if (y_offset) { \ + if (x_offset) { \ + *sse = sub_pixel_sse_diff_##wd##width_hv_lsx( \ + src, src_stride, ref, ref_stride, h_filter, v_filter, ht, &diff); \ + } else { \ + *sse = sub_pixel_sse_diff_##wd##width_v_lsx( \ + src, src_stride, ref, ref_stride, v_filter, ht, &diff); \ + } \ + \ + var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \ + } else { \ + if (x_offset) { \ + *sse = sub_pixel_sse_diff_##wd##width_h_lsx( \ + src, src_stride, ref, ref_stride, h_filter, ht, &diff); \ + \ + var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \ + } else { \ + var = vpx_variance##wd##x##ht##_lsx(src, src_stride, ref, ref_stride, \ + sse); \ + } \ + } \ + \ + return var; \ + } + +VPX_SUB_PIXEL_VARIANCE_WDXHT_LSX(8, 8) +VPX_SUB_PIXEL_VARIANCE_WDXHT_LSX(16, 16) +VPX_SUB_PIXEL_VARIANCE_WDXHT_LSX(32, 32) + +#define VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_LSX(ht) \ + uint32_t vpx_sub_pixel_avg_variance64x##ht##_lsx( \ + const uint8_t *src_ptr, int32_t src_stride, int32_t x_offset, \ + int32_t y_offset, const uint8_t *ref_ptr, int32_t ref_stride, \ + uint32_t *sse, const uint8_t *sec_pred) { \ + int32_t diff; \ + const uint8_t *h_filter = bilinear_filters_lsx[x_offset]; \ + const uint8_t *v_filter = bilinear_filters_lsx[y_offset]; \ + \ + if (y_offset) { \ + if (x_offset) { \ + *sse = sub_pixel_avg_sse_diff_64width_hv_lsx( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, \ + v_filter, ht, &diff); \ + } else { \ + *sse = sub_pixel_avg_sse_diff_64width_v_lsx( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \ + &diff); \ + } \ + } else { \ + if (x_offset) { \ + *sse = sub_pixel_avg_sse_diff_64width_h_lsx( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \ + &diff); \ + } else { \ + *sse = avg_sse_diff_64x##ht##_lsx(src_ptr, src_stride, ref_ptr, \ + ref_stride, sec_pred, &diff); \ + } \ + } \ + \ + return VARIANCE_64Wx##ht##H(*sse, diff); \ + } + +VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_LSX(64) diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/subtract_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/subtract_lsx.c new file mode 100644 index 0000000000..943a5c5a9b --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/subtract_lsx.c @@ -0,0 +1,371 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_util/loongson_intrinsics.h" + +static void sub_blk_4x4_lsx(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *pred_ptr, int32_t pred_stride, + int16_t *diff_ptr, int32_t diff_stride) { + __m128i src0, src1, src2, src3; + __m128i pred0, pred1, pred2, pred3; + __m128i diff0, diff1; + __m128i reg0, reg1; + int32_t src_stride2 = src_stride << 1; + int32_t pred_stride2 = pred_stride << 1; + int32_t diff_stride2 = diff_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t pred_stride3 = pred_stride2 + pred_stride; + int32_t diff_stride3 = diff_stride2 + diff_stride; + + DUP4_ARG2(__lsx_vldrepl_w, src_ptr, 0, src_ptr + src_stride, 0, + src_ptr + src_stride2, 0, src_ptr + src_stride3, 0, src0, src1, + src2, src3); + DUP4_ARG2(__lsx_vldrepl_w, pred_ptr, 0, pred_ptr + pred_stride, 0, + pred_ptr + pred_stride2, 0, pred_ptr + pred_stride3, 0, pred0, + pred1, pred2, pred3); + DUP4_ARG2(__lsx_vilvl_w, src1, src0, src3, src2, pred1, pred0, pred3, pred2, + src0, src2, pred0, pred2); + DUP2_ARG2(__lsx_vilvl_d, src2, src0, pred2, pred0, src0, pred0); + reg0 = __lsx_vilvl_b(src0, pred0); + reg1 = __lsx_vilvh_b(src0, pred0); + DUP2_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, diff0, diff1); + __lsx_vstelm_d(diff0, diff_ptr, 0, 0); + __lsx_vstelm_d(diff0, diff_ptr + diff_stride, 0, 1); + __lsx_vstelm_d(diff1, diff_ptr + diff_stride2, 0, 0); + __lsx_vstelm_d(diff1, diff_ptr + diff_stride3, 0, 1); +} + +static void sub_blk_8x8_lsx(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *pred_ptr, int32_t pred_stride, + int16_t *diff_ptr, int32_t diff_stride) { + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; + __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + int32_t src_stride2 = src_stride << 1; + int32_t pred_stride2 = pred_stride << 1; + int32_t dst_stride = diff_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t pred_stride3 = pred_stride2 + pred_stride; + int32_t dst_stride2 = dst_stride << 1; + int32_t src_stride4 = src_stride2 << 1; + int32_t pred_stride4 = pred_stride2 << 1; + int32_t dst_stride3 = dst_stride + dst_stride2; + + DUP4_ARG2(__lsx_vldrepl_d, src_ptr, 0, src_ptr + src_stride, 0, + src_ptr + src_stride2, 0, src_ptr + src_stride3, 0, src0, src1, + src2, src3); + DUP4_ARG2(__lsx_vldrepl_d, pred_ptr, 0, pred_ptr + pred_stride, 0, + pred_ptr + pred_stride2, 0, pred_ptr + pred_stride3, 0, pred0, + pred1, pred2, pred3); + src_ptr += src_stride4; + pred_ptr += pred_stride4; + + DUP4_ARG2(__lsx_vldrepl_d, src_ptr, 0, src_ptr + src_stride, 0, + src_ptr + src_stride2, 0, src_ptr + src_stride3, 0, src4, src5, + src6, src7); + DUP4_ARG2(__lsx_vldrepl_d, pred_ptr, 0, pred_ptr + pred_stride, 0, + pred_ptr + pred_stride2, 0, pred_ptr + pred_stride3, 0, pred4, + pred5, pred6, pred7); + + DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3, + reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7, + reg4, reg5, reg6, reg7); + DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3, reg3, + src0, src1, src2, src3); + DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7, reg7, + src4, src5, src6, src7); + __lsx_vst(src0, diff_ptr, 0); + __lsx_vstx(src1, diff_ptr, dst_stride); + __lsx_vstx(src2, diff_ptr, dst_stride2); + __lsx_vstx(src3, diff_ptr, dst_stride3); + diff_ptr += dst_stride2; + __lsx_vst(src4, diff_ptr, 0); 
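+  /* rows 5..7 of the lower half of the residual */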
+ __lsx_vstx(src5, diff_ptr, dst_stride); + __lsx_vstx(src6, diff_ptr, dst_stride2); + __lsx_vstx(src7, diff_ptr, dst_stride3); +} + +static void sub_blk_16x16_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *pred, int32_t pred_stride, + int16_t *diff, int32_t diff_stride) { + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; + __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int32_t src_stride2 = src_stride << 1; + int32_t pred_stride2 = pred_stride << 1; + int32_t dst_stride = diff_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t pred_stride3 = pred_stride2 + pred_stride; + int32_t dst_stride2 = dst_stride << 1; + int32_t src_stride4 = src_stride2 << 1; + int32_t pred_stride4 = pred_stride2 << 1; + int32_t dst_stride3 = dst_stride + dst_stride2; + int16_t *diff_tmp = diff + 8; + + DUP2_ARG2(__lsx_vld, src, 0, pred, 0, src0, pred0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + DUP4_ARG2(__lsx_vldx, pred, pred_stride, pred, pred_stride2, pred, + pred_stride3, pred, pred_stride4, pred1, pred2, pred3, pred4); + src += src_stride4; + pred += pred_stride4; + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + pred, pred_stride, src5, src6, src7, pred5); + DUP2_ARG2(__lsx_vldx, pred, pred_stride2, pred, pred_stride3, pred6, pred7); + src += src_stride4; + pred += pred_stride4; + DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3, + reg0, reg2, reg4, reg6); + DUP4_ARG2(__lsx_vilvh_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3, + reg1, reg3, reg5, reg7); + DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7, + tmp0, tmp2, tmp4, tmp6); + DUP4_ARG2(__lsx_vilvh_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7, + tmp1, tmp3, tmp5, tmp7); + DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3, reg3, + src0, src1, src2, src3); + DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7, reg7, + src4, src5, src6, src7); + DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, + pred0, pred1, pred2, pred3); + DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp4, tmp4, tmp5, tmp5, tmp6, tmp6, tmp7, tmp7, + pred4, pred5, pred6, pred7); + __lsx_vst(src0, diff, 0); + __lsx_vstx(src2, diff, dst_stride); + __lsx_vstx(src4, diff, dst_stride2); + __lsx_vstx(src6, diff, dst_stride3); + __lsx_vst(src1, diff_tmp, 0); + __lsx_vstx(src3, diff_tmp, dst_stride); + __lsx_vstx(src5, diff_tmp, dst_stride2); + __lsx_vstx(src7, diff_tmp, dst_stride3); + diff += dst_stride2; + diff_tmp += dst_stride2; + __lsx_vst(pred0, diff, 0); + __lsx_vstx(pred2, diff, dst_stride); + __lsx_vstx(pred4, diff, dst_stride2); + __lsx_vstx(pred6, diff, dst_stride3); + __lsx_vst(pred1, diff_tmp, 0); + __lsx_vstx(pred3, diff_tmp, dst_stride); + __lsx_vstx(pred5, diff_tmp, dst_stride2); + __lsx_vstx(pred7, diff_tmp, dst_stride3); + diff += dst_stride2; + diff_tmp += dst_stride2; + DUP2_ARG2(__lsx_vld, src, 0, pred, 0, src0, pred0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + DUP4_ARG2(__lsx_vldx, pred, pred_stride, pred, pred_stride2, pred, + pred_stride3, pred, pred_stride4, pred1, pred2, pred3, pred4); + src += src_stride4; + pred += pred_stride4; + DUP4_ARG2(__lsx_vldx, src, src_stride, src, 
src_stride2, src, src_stride3, + pred, pred_stride, src5, src6, src7, pred5); + DUP2_ARG2(__lsx_vldx, pred, pred_stride2, pred, pred_stride3, pred6, pred7); + DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3, + reg0, reg2, reg4, reg6); + DUP4_ARG2(__lsx_vilvh_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3, + reg1, reg3, reg5, reg7); + DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7, + tmp0, tmp2, tmp4, tmp6); + DUP4_ARG2(__lsx_vilvh_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7, + tmp1, tmp3, tmp5, tmp7); + DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3, reg3, + src0, src1, src2, src3); + DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7, reg7, + src4, src5, src6, src7); + DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, + pred0, pred1, pred2, pred3); + DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp4, tmp4, tmp5, tmp5, tmp6, tmp6, tmp7, tmp7, + pred4, pred5, pred6, pred7); + __lsx_vst(src0, diff, 0); + __lsx_vstx(src2, diff, dst_stride); + __lsx_vstx(src4, diff, dst_stride2); + __lsx_vstx(src6, diff, dst_stride3); + __lsx_vst(src1, diff_tmp, 0); + __lsx_vstx(src3, diff_tmp, dst_stride); + __lsx_vstx(src5, diff_tmp, dst_stride2); + __lsx_vstx(src7, diff_tmp, dst_stride3); + diff += dst_stride2; + diff_tmp += dst_stride2; + __lsx_vst(pred0, diff, 0); + __lsx_vstx(pred2, diff, dst_stride); + __lsx_vstx(pred4, diff, dst_stride2); + __lsx_vstx(pred6, diff, dst_stride3); + __lsx_vst(pred1, diff_tmp, 0); + __lsx_vstx(pred3, diff_tmp, dst_stride); + __lsx_vstx(pred5, diff_tmp, dst_stride2); + __lsx_vstx(pred7, diff_tmp, dst_stride3); +} + +static void sub_blk_32x32_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *pred, int32_t pred_stride, + int16_t *diff, int32_t diff_stride) { + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; + __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + uint32_t loop_cnt; + int32_t src_stride2 = src_stride << 1; + int32_t pred_stride2 = pred_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t pred_stride3 = pred_stride2 + pred_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t pred_stride4 = pred_stride2 << 1; + + for (loop_cnt = 8; loop_cnt--;) { + const uint8_t *src_tmp = src + 16; + const uint8_t *pred_tmp = pred + 16; + DUP4_ARG2(__lsx_vld, src, 0, src_tmp, 0, pred, 0, pred_tmp, 0, src0, src1, + pred0, pred1); + DUP4_ARG2(__lsx_vldx, src, src_stride, src_tmp, src_stride, src, + src_stride2, src_tmp, src_stride2, src2, src3, src4, src5); + DUP4_ARG2(__lsx_vldx, src, src_stride3, src_tmp, src_stride3, pred, + pred_stride, pred_tmp, pred_stride, src6, src7, pred2, pred3); + DUP4_ARG2(__lsx_vldx, pred, pred_stride2, pred_tmp, pred_stride2, pred, + pred_stride3, pred_tmp, pred_stride3, pred4, pred5, pred6, pred7); + DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3, + reg0, reg2, reg4, reg6); + DUP4_ARG2(__lsx_vilvh_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3, + reg1, reg3, reg5, reg7); + DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7, + tmp0, tmp2, tmp4, tmp6); + DUP4_ARG2(__lsx_vilvh_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7, + tmp1, tmp3, tmp5, tmp7); + DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3, + reg3, src0, src1, src2, src3); + DUP4_ARG2(__lsx_vhsubw_hu_bu, 
reg4, reg4, reg5, reg5, reg6, reg6, reg7, + reg7, src4, src5, src6, src7); + DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, + tmp3, pred0, pred1, pred2, pred3); + DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp4, tmp4, tmp5, tmp5, tmp6, tmp6, tmp7, + tmp7, pred4, pred5, pred6, pred7); + src += src_stride4; + pred += pred_stride4; + __lsx_vst(src0, diff, 0); + __lsx_vst(src1, diff, 16); + __lsx_vst(src2, diff, 32); + __lsx_vst(src3, diff, 48); + diff += diff_stride; + __lsx_vst(src4, diff, 0); + __lsx_vst(src5, diff, 16); + __lsx_vst(src6, diff, 32); + __lsx_vst(src7, diff, 48); + diff += diff_stride; + __lsx_vst(pred0, diff, 0); + __lsx_vst(pred1, diff, 16); + __lsx_vst(pred2, diff, 32); + __lsx_vst(pred3, diff, 48); + diff += diff_stride; + __lsx_vst(pred4, diff, 0); + __lsx_vst(pred5, diff, 16); + __lsx_vst(pred6, diff, 32); + __lsx_vst(pred7, diff, 48); + diff += diff_stride; + } +} + +static void sub_blk_64x64_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *pred, int32_t pred_stride, + int16_t *diff, int32_t diff_stride) { + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; + __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + uint32_t loop_cnt; + + for (loop_cnt = 32; loop_cnt--;) { + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, + src3); + DUP4_ARG2(__lsx_vld, pred, 0, pred, 16, pred, 32, pred, 48, pred0, pred1, + pred2, pred3); + src += src_stride; + pred += pred_stride; + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src4, src5, src6, + src7); + DUP4_ARG2(__lsx_vld, pred, 0, pred, 16, pred, 32, pred, 48, pred4, pred5, + pred6, pred7); + src += src_stride; + pred += pred_stride; + + DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3, + reg0, reg2, reg4, reg6); + DUP4_ARG2(__lsx_vilvh_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3, + reg1, reg3, reg5, reg7); + DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7, + tmp0, tmp2, tmp4, tmp6); + DUP4_ARG2(__lsx_vilvh_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7, + tmp1, tmp3, tmp5, tmp7); + DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3, + reg3, src0, src1, src2, src3); + DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7, + reg7, src4, src5, src6, src7); + DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, + tmp3, pred0, pred1, pred2, pred3); + DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp4, tmp4, tmp5, tmp5, tmp6, tmp6, tmp7, + tmp7, pred4, pred5, pred6, pred7); + __lsx_vst(src0, diff, 0); + __lsx_vst(src1, diff, 16); + __lsx_vst(src2, diff, 32); + __lsx_vst(src3, diff, 48); + __lsx_vst(src4, diff, 64); + __lsx_vst(src5, diff, 80); + __lsx_vst(src6, diff, 96); + __lsx_vst(src7, diff, 112); + diff += diff_stride; + __lsx_vst(pred0, diff, 0); + __lsx_vst(pred1, diff, 16); + __lsx_vst(pred2, diff, 32); + __lsx_vst(pred3, diff, 48); + __lsx_vst(pred4, diff, 64); + __lsx_vst(pred5, diff, 80); + __lsx_vst(pred6, diff, 96); + __lsx_vst(pred7, diff, 112); + diff += diff_stride; + } +} + +void vpx_subtract_block_lsx(int32_t rows, int32_t cols, int16_t *diff_ptr, + ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, + ptrdiff_t pred_stride) { + if (rows == cols) { + switch (rows) { + case 4: + sub_blk_4x4_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); + break; + case 
8: + sub_blk_8x8_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); + break; + case 16: + sub_blk_16x16_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); + break; + case 32: + sub_blk_32x32_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); + break; + case 64: + sub_blk_64x64_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); + break; + default: + vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, + src_stride, pred_ptr, pred_stride); + break; + } + } else { + vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, src_stride, + pred_ptr, pred_stride); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/txfm_macros_lsx.h b/media/libvpx/libvpx/vpx_dsp/loongarch/txfm_macros_lsx.h new file mode 100644 index 0000000000..bd514831bf --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/txfm_macros_lsx.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_LOONGARCH_TXFM_MACROS_LSX_H_ +#define VPX_VPX_DSP_LOONGARCH_TXFM_MACROS_LSX_H_ + +#include "vpx_util/loongson_intrinsics.h" + +#define DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) \ + do { \ + __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m; \ + __m128i k0_m, k1_m, k2_m, k3_m; \ + \ + k0_m = __lsx_vreplgr2vr_h(cnst0); \ + k1_m = __lsx_vreplgr2vr_h(cnst1); \ + k2_m = __lsx_vpackev_h(k1_m, k0_m); \ + \ + DUP2_ARG2(__lsx_vilvl_h, reg1, reg0, reg0, reg1, s5_m, s3_m); \ + DUP2_ARG2(__lsx_vilvh_h, reg1, reg0, reg0, reg1, s4_m, s2_m); \ + \ + DUP2_ARG2(__lsx_vmulwev_w_h, s5_m, k0_m, s4_m, k0_m, s1_m, s0_m); \ + k3_m = __lsx_vmulwod_w_h(s5_m, k1_m); \ + s1_m = __lsx_vsub_w(s1_m, k3_m); \ + k3_m = __lsx_vmulwod_w_h(s4_m, k1_m); \ + s0_m = __lsx_vsub_w(s0_m, k3_m); \ + \ + out0 = __lsx_vssrarni_h_w(s0_m, s1_m, DCT_CONST_BITS); \ + \ + DUP2_ARG2(__lsx_vdp2_w_h, s3_m, k2_m, s2_m, k2_m, s1_m, s0_m); \ + out1 = __lsx_vssrarni_h_w(s0_m, s1_m, DCT_CONST_BITS); \ + } while (0) + +#define DOT_SHIFT_RIGHT_PCK_H(in0, in1, in2, in3) \ + do { \ + __m128i tp0_m, tp1_m; \ + \ + DUP2_ARG2(__lsx_vdp2_w_h, in0, in2, in1, in2, tp1_m, tp0_m); \ + in3 = __lsx_vssrarni_h_w(tp1_m, tp0_m, DCT_CONST_BITS); \ + } while (0) + +#endif // VPX_VPX_DSP_LOONGARCH_TXFM_MACROS_LSX_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/variance_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/variance_lsx.c new file mode 100644 index 0000000000..8fad342c71 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/variance_lsx.c @@ -0,0 +1,263 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/variance_lsx.h" + +#define VARIANCE_WxH(sse, diff, shift) \ + (sse) - (((uint32_t)(diff) * (diff)) >> (shift)) + +#define VARIANCE_LARGE_WxH(sse, diff, shift) \ + (sse) - (((int64_t)(diff) * (diff)) >> (shift)) + +static uint32_t sse_diff_8width_lsx(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height, int32_t *diff) { + int32_t res, ht_cnt = (height >> 2); + __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3, vec; + __m128i avg = __lsx_vldi(0); + __m128i var = avg; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t ref_stride2 = ref_stride << 1; + int32_t ref_stride3 = ref_stride2 + ref_stride; + int32_t ref_stride4 = ref_stride2 << 1; + + for (; ht_cnt--;) { + DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr + src_stride, 0, + src_ptr + src_stride2, 0, src_ptr + src_stride3, 0, src0, src1, + src2, src3); + src_ptr += src_stride4; + DUP4_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr + ref_stride, 0, + ref_ptr + ref_stride2, 0, ref_ptr + ref_stride3, 0, ref0, ref1, + ref2, ref3); + ref_ptr += ref_stride4; + + DUP4_ARG2(__lsx_vpickev_d, src1, src0, src3, src2, ref1, ref0, ref3, ref2, + src0, src1, ref0, ref1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + } + + vec = __lsx_vhaddw_w_h(avg, avg); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + return res; +} + +static uint32_t sse_diff_16width_lsx(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height, int32_t *diff) { + int32_t res, ht_cnt = (height >> 2); + __m128i src, ref, vec; + __m128i avg = __lsx_vldi(0); + __m128i var = avg; + + for (; ht_cnt--;) { + src = __lsx_vld(src_ptr, 0); + src_ptr += src_stride; + ref = __lsx_vld(ref_ptr, 0); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src, ref, var, avg); + + src = __lsx_vld(src_ptr, 0); + src_ptr += src_stride; + ref = __lsx_vld(ref_ptr, 0); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src, ref, var, avg); + src = __lsx_vld(src_ptr, 0); + src_ptr += src_stride; + ref = __lsx_vld(ref_ptr, 0); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src, ref, var, avg); + + src = __lsx_vld(src_ptr, 0); + src_ptr += src_stride; + ref = __lsx_vld(ref_ptr, 0); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src, ref, var, avg); + } + vec = __lsx_vhaddw_w_h(avg, avg); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + return res; +} + +static uint32_t sse_diff_32width_lsx(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height, int32_t *diff) { + int32_t res, ht_cnt = (height >> 2); + __m128i avg = __lsx_vldi(0); + __m128i src0, src1, ref0, ref1; + __m128i vec; + __m128i var = avg; + + for (; ht_cnt--;) { + DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1); + src_ptr += src_stride; + DUP2_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1); + src_ptr += src_stride; + DUP2_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1); + src_ptr += src_stride; + DUP2_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + 
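/* third of the four unrolled rows */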
CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1); + src_ptr += src_stride; + DUP2_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + } + + vec = __lsx_vhaddw_w_h(avg, avg); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + return res; +} + +static uint32_t sse_diff_64x64_lsx(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t *diff) { + int32_t res, ht_cnt = 32; + __m128i avg0 = __lsx_vldi(0); + __m128i src0, src1, src2, src3; + __m128i ref0, ref1, ref2, ref3; + __m128i vec0, vec1; + __m128i avg1 = avg0; + __m128i avg2 = avg0; + __m128i avg3 = avg0; + __m128i var = avg0; + + for (; ht_cnt--;) { + DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48, + src0, src1, src2, src3); + src_ptr += src_stride; + DUP4_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref_ptr, 32, ref_ptr, 48, + ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src2, ref2, var, avg2); + CALC_MSE_AVG_B(src3, ref3, var, avg3); + DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48, + src0, src1, src2, src3); + src_ptr += src_stride; + DUP4_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref_ptr, 32, ref_ptr, 48, + ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src2, ref2, var, avg2); + CALC_MSE_AVG_B(src3, ref3, var, avg3); + } + vec0 = __lsx_vhaddw_w_h(avg0, avg0); + vec1 = __lsx_vhaddw_w_h(avg1, avg1); + vec0 = __lsx_vadd_w(vec0, vec1); + vec1 = __lsx_vhaddw_w_h(avg2, avg2); + vec0 = __lsx_vadd_w(vec0, vec1); + vec1 = __lsx_vhaddw_w_h(avg3, avg3); + vec0 = __lsx_vadd_w(vec0, vec1); + HADD_SW_S32(vec0, *diff); + HADD_SW_S32(var, res); + return res; +} + +#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6) +#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8) + +#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10) +#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12) + +#define VPX_VARIANCE_WDXHT_LSX(wd, ht) \ + uint32_t vpx_variance##wd##x##ht##_lsx( \ + const uint8_t *src, int32_t src_stride, const uint8_t *ref, \ + int32_t ref_stride, uint32_t *sse) { \ + int32_t diff; \ + \ + *sse = \ + sse_diff_##wd##width_lsx(src, src_stride, ref, ref_stride, ht, &diff); \ + \ + return VARIANCE_##wd##Wx##ht##H(*sse, diff); \ + } + +static uint32_t sse_16width_lsx(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height) { + int32_t res, ht_cnt = (height >> 2); + __m128i src, ref; + __m128i var = __lsx_vldi(0); + + for (; ht_cnt--;) { + DUP2_ARG2(__lsx_vld, src_ptr, 0, ref_ptr, 0, src, ref); + src_ptr += src_stride; + ref_ptr += ref_stride; + CALC_MSE_B(src, ref, var); + + DUP2_ARG2(__lsx_vld, src_ptr, 0, ref_ptr, 0, src, ref); + src_ptr += src_stride; + ref_ptr += ref_stride; + CALC_MSE_B(src, ref, var); + + DUP2_ARG2(__lsx_vld, src_ptr, 0, ref_ptr, 0, src, ref); + src_ptr += src_stride; + ref_ptr += ref_stride; + CALC_MSE_B(src, ref, var); + + DUP2_ARG2(__lsx_vld, src_ptr, 0, ref_ptr, 0, src, ref); + src_ptr += src_stride; + ref_ptr += ref_stride; + CALC_MSE_B(src, ref, var); + } + HADD_SW_S32(var, res); + return res; +} + +VPX_VARIANCE_WDXHT_LSX(8, 8) 
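+/* Each instantiation computes variance = sse - (sum * sum) / (wd * ht);
+ * the division is the 6-, 8- or 10-bit right shift selected above. */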
+VPX_VARIANCE_WDXHT_LSX(16, 16) +VPX_VARIANCE_WDXHT_LSX(32, 32) + +uint32_t vpx_variance64x64_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + uint32_t *sse) { + int32_t diff; + + *sse = sse_diff_64x64_lsx(src, src_stride, ref, ref_stride, &diff); + + return VARIANCE_64Wx64H(*sse, diff); +} + +uint32_t vpx_mse16x16_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + uint32_t *sse) { + *sse = sse_16width_lsx(src, src_stride, ref, ref_stride, 16); + + return *sse; +} + +void vpx_get16x16var_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, uint32_t *sse, + int32_t *sum) { + *sse = sse_diff_16width_lsx(src, src_stride, ref, ref_stride, 16, sum); +} diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/variance_lsx.h b/media/libvpx/libvpx/vpx_dsp/loongarch/variance_lsx.h new file mode 100644 index 0000000000..cf9e9890ff --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/variance_lsx.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_LOONGARCH_VARIANCE_LSX_H_ +#define VPX_VPX_DSP_LOONGARCH_VARIANCE_LSX_H_ + +#include "vpx_util/loongson_intrinsics.h" + +#define HADD_SW_S32(in0, in1) \ + do { \ + __m128i res0_m; \ + \ + res0_m = __lsx_vhaddw_d_w(in0, in0); \ + res0_m = __lsx_vhaddw_q_d(res0_m, res0_m); \ + in1 = __lsx_vpickve2gr_w(res0_m, 0); \ + } while (0) + +#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift, in2) \ + do { \ + __m128i tmp0_m, tmp1_m; \ + \ + tmp0_m = __lsx_vshuf_b(in1, in0, mask); \ + tmp1_m = __lsx_vdp2_h_bu(tmp0_m, coeff); \ + in2 = __lsx_vsrari_h(tmp1_m, shift); \ + } while (0) + +#define CALC_MSE_B(src, ref, var) \ + do { \ + __m128i src_l0_m, src_l1_m; \ + __m128i res_l0_m, res_l1_m; \ + \ + src_l0_m = __lsx_vilvl_b(src, ref); \ + src_l1_m = __lsx_vilvh_b(src, ref); \ + DUP2_ARG2(__lsx_vhsubw_hu_bu, src_l0_m, src_l0_m, src_l1_m, src_l1_m, \ + res_l0_m, res_l1_m); \ + var = __lsx_vdp2add_w_h(var, res_l0_m, res_l0_m); \ + var = __lsx_vdp2add_w_h(var, res_l1_m, res_l1_m); \ + } while (0) + +#define CALC_MSE_AVG_B(src, ref, var, sub) \ + do { \ + __m128i src_l0_m, src_l1_m; \ + __m128i res_l0_m, res_l1_m; \ + \ + src_l0_m = __lsx_vilvl_b(src, ref); \ + src_l1_m = __lsx_vilvh_b(src, ref); \ + DUP2_ARG2(__lsx_vhsubw_hu_bu, src_l0_m, src_l0_m, src_l1_m, src_l1_m, \ + res_l0_m, res_l1_m); \ + var = __lsx_vdp2add_w_h(var, res_l0_m, res_l0_m); \ + var = __lsx_vdp2add_w_h(var, res_l1_m, res_l1_m); \ + sub = __lsx_vadd_h(sub, res_l0_m); \ + sub = __lsx_vadd_h(sub, res_l1_m); \ + } while (0) + +#endif // VPX_VPX_DSP_LOONGARCH_VARIANCE_LSX_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_horiz_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_horiz_lsx.c new file mode 100644 index 0000000000..1c59228813 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_horiz_lsx.c @@ -0,0 +1,972 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/vpx_convolve_lsx.h" + +static const uint8_t mc_filt_mask_arr[16 * 3] = { + /* 8 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, + /* 4 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20, + /* 4 width cases */ + 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 +}; + +static void common_hz_8t_and_aver_dst_4x4_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + uint8_t *dst_tmp = dst; + __m128i src0, src1, src2, src3; + __m128i filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3; + __m128i tmp0, tmp1; + __m128i dst0, dst1, dst2, dst3; + + mask0 = __lsx_vld(mc_filt_mask_arr, 16); + src -= 3; + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + LSX_LD_4(src, src_stride, src0, src1, src2, src3); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filter0, filter1, filter2, filter3, tmp0, tmp1); + dst0 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_w(dst_tmp, 0); + dst0 = __lsx_vilvl_w(dst1, dst0); + dst1 = __lsx_vilvl_w(dst3, dst2); + dst0 = __lsx_vilvl_d(dst1, dst0); + tmp0 = __lsx_vssrarni_b_h(tmp1, tmp0, 7); + tmp0 = __lsx_vxori_b(tmp0, 128); + dst0 = __lsx_vavgr_bu(tmp0, dst0); + __lsx_vstelm_w(dst0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(dst0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(dst0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(dst0, dst, 0, 3); +} + +static void common_hz_8t_and_aver_dst_4x8_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + uint8_t *dst_tmp = dst; + __m128i src0, src1, src2, src3, filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3, tmp0, tmp1, tmp2, tmp3; + __m128i dst0, dst1; + + mask0 = __lsx_vld(mc_filt_mask_arr, 16); + src -= 3; + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + + LSX_LD_4(src, src_stride, src0, src1, src2, src3); + src += src_stride; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + tmp0 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + tmp1 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + tmp2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + tmp3 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + tmp0 = __lsx_vilvl_w(tmp1, tmp0); + tmp1 = __lsx_vilvl_w(tmp3, tmp2); + dst0 = __lsx_vilvl_d(tmp1, tmp0); + + tmp0 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + tmp1 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + tmp2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + tmp3 = __lsx_vldrepl_w(dst_tmp, 0); + tmp0 = __lsx_vilvl_w(tmp1, tmp0); + tmp1 = 
__lsx_vilvl_w(tmp3, tmp2); + dst1 = __lsx_vilvl_d(tmp1, tmp0); + + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filter0, filter1, filter2, filter3, tmp0, tmp1); + LSX_LD_4(src, src_stride, src0, src1, src2, src3); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filter0, filter1, filter2, filter3, tmp2, tmp3); + DUP4_ARG3(__lsx_vssrarni_b_h, tmp0, tmp0, 7, tmp1, tmp1, 7, tmp2, tmp2, 7, + tmp3, tmp3, 7, tmp0, tmp1, tmp2, tmp3); + DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp1, dst1, dst0, dst1); + __lsx_vstelm_w(dst0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(dst0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(dst0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(dst0, dst, 0, 3); + dst += dst_stride; + __lsx_vstelm_w(dst1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(dst1, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(dst1, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(dst1, dst, 0, 3); +} + +static void common_hz_8t_and_aver_dst_4w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + if (height == 4) { + common_hz_8t_and_aver_dst_4x4_lsx(src, src_stride, dst, dst_stride, filter); + } else if (height == 8) { + common_hz_8t_and_aver_dst_4x8_lsx(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_hz_8t_and_aver_dst_8w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + int32_t loop_cnt = height >> 2; + uint8_t *dst_tmp = dst; + __m128i src0, src1, src2, src3, filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i dst0, dst1, dst2, dst3; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + uint8_t *_src = (uint8_t *)src - 3; + + mask0 = __lsx_vld(mc_filt_mask_arr, 0); + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + for (; loop_cnt--;) { + src0 = __lsx_vld(_src, 0); + DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2); + src3 = __lsx_vldx(_src, src_stride3); + _src += src_stride4; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filter0, filter1, filter2, filter3, tmp0, + tmp1, tmp2, tmp3); + dst0 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); + DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp1, dst1, dst0, dst1); + __lsx_vstelm_d(dst0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(dst0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(dst1, dst, 0, 0); + dst += dst_stride; + 
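/* fourth and final averaged row of this group */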
__lsx_vstelm_d(dst1, dst, 0, 1); + dst += dst_stride; + } +} + +static void common_hz_8t_and_aver_dst_16w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + int32_t loop_cnt = height >> 1; + int32_t dst_stride2 = dst_stride << 1; + uint8_t *dst_tmp = dst; + __m128i src0, src1, src2, src3, filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3, dst0, dst1, dst2, dst3; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __m128i tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; + + mask0 = __lsx_vld(mc_filt_mask_arr, 0); + src -= 3; + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3); + src += src_stride; + dst0 = __lsx_vld(dst_tmp, 0); + dst1 = __lsx_vldx(dst_tmp, dst_stride); + dst_tmp += dst_stride2; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, src2, src2, + mask0, src3, src3, mask0, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, src2, src2, + mask1, src3, src3, mask1, tmp4, tmp5, tmp6, tmp7); + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2, src2, src2, + mask2, src3, src3, mask2, tmp8, tmp9, tmp10, tmp11); + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src1, mask3, src2, src2, + mask3, src3, src3, mask3, tmp12, tmp13, tmp14, tmp15); + DUP4_ARG2(__lsx_vdp2_h_b, tmp0, filter0, tmp1, filter0, tmp2, filter0, tmp3, + filter0, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG2(__lsx_vdp2_h_b, tmp8, filter2, tmp9, filter2, tmp10, filter2, + tmp11, filter2, tmp8, tmp9, tmp10, tmp11); + DUP4_ARG3(__lsx_vdp2add_h_b, tmp0, tmp4, filter1, tmp1, tmp5, filter1, tmp2, + tmp6, filter1, tmp3, tmp7, filter1, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG3(__lsx_vdp2add_h_b, tmp8, tmp12, filter3, tmp9, tmp13, filter3, + tmp10, tmp14, filter3, tmp11, tmp15, filter3, tmp4, tmp5, tmp6, + tmp7); + DUP4_ARG2(__lsx_vsadd_h, tmp0, tmp4, tmp1, tmp5, tmp2, tmp6, tmp3, tmp7, + tmp0, tmp1, tmp2, tmp3); + DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, dst2, dst3); + DUP2_ARG2(__lsx_vxori_b, dst2, 128, dst3, 128, dst2, dst3); + DUP2_ARG2(__lsx_vavgr_bu, dst0, dst2, dst1, dst3, dst0, dst1); + __lsx_vst(dst0, dst, 0); + __lsx_vstx(dst1, dst, dst_stride); + dst += dst_stride2; + } +} + +static void common_hz_8t_and_aver_dst_32w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = height; + uint8_t *dst_tmp = dst; + __m128i src0, src1, src2, src3, filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3, dst0, dst1; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __m128i tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + mask0 = __lsx_vld(mc_filt_mask_arr, 0); + src -= 3; + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2); + src3 = 
__lsx_vld(src, 24); + src1 = __lsx_vshuf_b(src2, src0, shuff); + src += src_stride; + DUP2_ARG2(__lsx_vld, dst_tmp, 0, dst, 16, dst0, dst1); + dst_tmp += dst_stride; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, src2, src2, + mask0, src3, src3, mask0, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, src2, src2, + mask1, src3, src3, mask1, tmp4, tmp5, tmp6, tmp7); + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2, src2, src2, + mask2, src3, src3, mask2, tmp8, tmp9, tmp10, tmp11); + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src1, mask3, src2, src2, + mask3, src3, src3, mask3, tmp12, tmp13, tmp14, tmp15); + DUP4_ARG2(__lsx_vdp2_h_b, tmp0, filter0, tmp1, filter0, tmp2, filter0, tmp3, + filter0, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG2(__lsx_vdp2_h_b, tmp8, filter2, tmp9, filter2, tmp10, filter2, + tmp11, filter2, tmp8, tmp9, tmp10, tmp11); + DUP4_ARG3(__lsx_vdp2add_h_b, tmp0, tmp4, filter1, tmp1, tmp5, filter1, tmp2, + tmp6, filter1, tmp3, tmp7, filter1, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG3(__lsx_vdp2add_h_b, tmp8, tmp12, filter3, tmp9, tmp13, filter3, + tmp10, tmp14, filter3, tmp11, tmp15, filter3, tmp4, tmp5, tmp6, + tmp7); + DUP4_ARG2(__lsx_vsadd_h, tmp0, tmp4, tmp1, tmp5, tmp2, tmp6, tmp3, tmp7, + tmp0, tmp1, tmp2, tmp3); + DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + DUP2_ARG2(__lsx_vavgr_bu, dst0, tmp0, dst1, tmp1, dst0, dst1); + __lsx_vst(dst0, dst, 0); + __lsx_vst(dst1, dst, 16); + dst += dst_stride; + } +} + +static void common_hz_8t_and_aver_dst_64w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + int32_t loop_cnt = height; + __m128i src0, src1, src2, src3; + __m128i filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3; + __m128i out0, out1, out2, out3, dst0, dst1; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + mask0 = __lsx_vld(mc_filt_mask_arr, 0); + src -= 3; + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2); + src3 = __lsx_vld(src, 24); + src1 = __lsx_vshuf_b(src2, src0, shuff); + DUP2_ARG2(__lsx_vld, dst, 0, dst, 16, dst0, dst1); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filter0, filter1, filter2, filter3, out0, + out1, out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + DUP2_ARG2(__lsx_vavgr_bu, out0, dst0, out1, dst1, out0, out1); + __lsx_vst(out0, dst, 0); + __lsx_vst(out1, dst, 16); + + DUP2_ARG2(__lsx_vld, src, 32, src, 48, src0, src2); + src3 = __lsx_vld(src, 56); + src1 = __lsx_vshuf_b(src2, src0, shuff); + DUP2_ARG2(__lsx_vld, dst, 32, dst, 48, dst0, dst1); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filter0, filter1, filter2, filter3, out0, + out1, out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, 
out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + DUP2_ARG2(__lsx_vavgr_bu, out0, dst0, out1, dst1, out0, out1); + __lsx_vst(out0, dst, 32); + __lsx_vst(out1, dst, 48); + src += src_stride; + dst += dst_stride; + } +} + +static void common_hz_2t_and_aver_dst_4x4_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + __m128i src0, src1, src2, src3, mask; + __m128i dst0, dst1, dst2, dst3, vec0, vec1, filt0; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + uint8_t *dst_tmp = dst; + + mask = __lsx_vld(mc_filt_mask_arr, 16); + + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + dst0 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_w, dst1, dst0, dst3, dst2, dst0, dst1); + dst0 = __lsx_vilvl_d(dst1, dst0); + DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask, src3, src2, mask, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec0, vec1); + vec0 = __lsx_vssrarni_bu_h(vec1, vec0, FILTER_BITS); + vec0 = __lsx_vavgr_bu(vec0, dst0); + __lsx_vstelm_w(vec0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(vec0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(vec0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(vec0, dst, 0, 3); +} + +static void common_hz_2t_and_aver_dst_4x8_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; + __m128i filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3; + __m128i dst0, dst1, dst2, dst3, dst4; + __m128i vec4, vec5, vec6, vec7; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + uint8_t *src_tmp1 = (uint8_t *)src + src_stride4; + uint8_t *dst_tmp = dst; + + mask = __lsx_vld(mc_filt_mask_arr, 16); + + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + + src4 = __lsx_vld(src_tmp1, 0); + DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src5, + src6); + src7 = __lsx_vldx(src_tmp1, src_stride3); + + dst0 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_w, dst1, dst0, dst3, dst2, dst0, dst1); + dst0 = __lsx_vilvl_d(dst1, dst0); + + dst1 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst4 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_w, dst2, dst1, dst4, dst3, dst1, dst2); + dst1 = __lsx_vilvl_d(dst2, dst1); + + DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask, src3, src2, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src5, src4, mask, src7, src6, mask, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, 
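+  /* Each __lsx_vdp2_h_bu multiplies adjacent unsigned source bytes by the
+   * two bilinear taps replicated in filt0 and sums each pair into a 16-bit
+   * lane. For 2-tap kernels the active coefficients sit at positions 3 and
+   * 4 of the 8-tap InterpKernel, which is why the dispatch code passes
+   * &filt_hor[3] down to these helpers. */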
vec1, filt0, vec2, filt0, vec3, filt0, + vec4, vec5, vec6, vec7); + DUP4_ARG3(__lsx_vssrarni_bu_h, vec4, vec4, FILTER_BITS, vec5, vec5, + FILTER_BITS, vec6, vec6, FILTER_BITS, vec7, vec7, FILTER_BITS, res0, + res1, res2, res3); + DUP2_ARG2(__lsx_vilvl_d, res1, res0, res3, res2, res0, res2); + DUP2_ARG2(__lsx_vavgr_bu, res0, dst0, res2, dst1, res0, res2); + + __lsx_vstelm_w(res0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(res0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(res0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(res0, dst, 0, 3); + dst += dst_stride; + + __lsx_vstelm_w(res2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(res2, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(res2, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(res2, dst, 0, 3); + dst += dst_stride; +} + +static void common_hz_2t_and_aver_dst_4w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + if (height == 4) { + common_hz_2t_and_aver_dst_4x4_lsx(src, src_stride, dst, dst_stride, filter); + } else if (height == 8) { + common_hz_2t_and_aver_dst_4x8_lsx(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_hz_2t_and_aver_dst_8x4_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + __m128i src0, src1, src2, src3, mask; + __m128i filt0, dst0, dst1, dst2, dst3; + __m128i vec0, vec1, vec2, vec3; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + uint8_t *dst_tmp = dst; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + vec0, vec1, vec2, vec3); + DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2, + FILTER_BITS, vec0, vec1); + dst0 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + + DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); + DUP2_ARG2(__lsx_vavgr_bu, vec0, dst0, vec1, dst1, vec0, vec1); + __lsx_vstelm_d(vec0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(vec0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(vec1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(vec1, dst, 0, 1); +} + +static void common_hz_2t_and_aver_dst_8x8mult_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + __m128i src0, src1, src2, src3, mask; + __m128i filt0, dst0, dst1, dst2, dst3; + __m128i vec0, vec1, vec2, vec3; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + uint8_t *dst_tmp = dst; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + LSX_LD_4(src, src_stride, src0, src1, src2, src3); + src += src_stride; + + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, 
filt0, vec2, filt0, vec3, filt0, + vec0, vec1, vec2, vec3); + DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2, + FILTER_BITS, vec0, vec2); + dst0 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); + + DUP2_ARG2(__lsx_vavgr_bu, vec0, dst0, vec2, dst1, vec0, vec2); + __lsx_vstelm_d(vec0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(vec0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(vec2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(vec2, dst, 0, 1); + dst += dst_stride; + + LSX_LD_4(src, src_stride, src0, src1, src2, src3); + src += src_stride; + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + vec0, vec1, vec2, vec3); + DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2, + FILTER_BITS, vec0, vec2); + dst0 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); + DUP2_ARG2(__lsx_vavgr_bu, vec0, dst0, vec2, dst1, vec0, vec2); + __lsx_vstelm_d(vec0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(vec0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(vec2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(vec2, dst, 0, 1); + dst += dst_stride; + + if (height == 16) { + LSX_LD_4(src, src_stride, src0, src1, src2, src3); + src += src_stride; + + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, vec0, vec1, vec2, vec3); + DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2, + FILTER_BITS, vec0, vec2); + dst0 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); + DUP2_ARG2(__lsx_vavgr_bu, vec0, dst0, vec2, dst1, vec0, vec2); + __lsx_vstelm_d(vec0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(vec0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(vec2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(vec2, dst, 0, 1); + dst += dst_stride; + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, vec0, vec1, vec2, vec3); + DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2, + FILTER_BITS, vec0, vec2); + dst0 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += 
dst_stride; + dst3 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); + DUP2_ARG2(__lsx_vavgr_bu, vec0, dst0, vec2, dst1, vec0, vec2); + __lsx_vstelm_d(vec0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(vec0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(vec2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(vec2, dst, 0, 1); + dst += dst_stride; + } +} + +static void common_hz_2t_and_aver_dst_8w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + if (height == 4) { + common_hz_2t_and_aver_dst_8x4_lsx(src, src_stride, dst, dst_stride, filter); + } else { + common_hz_2t_and_aver_dst_8x8mult_lsx(src, src_stride, dst, dst_stride, + filter, height); + } +} + +static void common_hz_2t_and_aver_dst_16w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = (height >> 2) - 1; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; + __m128i filt0, dst0; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i res0, res1, res2, res3, res4, res5, res6, res7; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + uint8_t *src_tmp1 = (uint8_t *)src + 8; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4); + src6 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + src1 = __lsx_vld(src_tmp1, 0); + DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src3, + src5); + src7 = __lsx_vldx(src_tmp1, src_stride3); + src_tmp1 += src_stride4; + + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5); + DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7); + + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + res0, res1, res2, res3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, filt0, + res4, res5, res6, res7); + DUP4_ARG3(__lsx_vssrarni_bu_h, res1, res0, FILTER_BITS, res3, res2, + FILTER_BITS, res5, res4, FILTER_BITS, res7, res6, FILTER_BITS, res0, + res2, res4, res6); + dst0 = __lsx_vld(dst, 0); + res0 = __lsx_vavgr_bu(res0, dst0); + __lsx_vst(res0, dst, 0); + dst += dst_stride; + + dst0 = __lsx_vld(dst, 0); + res2 = __lsx_vavgr_bu(res2, dst0); + __lsx_vst(res2, dst, 0); + dst += dst_stride; + + dst0 = __lsx_vld(dst, 0); + res4 = __lsx_vavgr_bu(res4, dst0); + __lsx_vst(res4, dst, 0); + dst += dst_stride; + + dst0 = __lsx_vld(dst, 0); + res6 = __lsx_vavgr_bu(res6, dst0); + __lsx_vst(res6, dst, 0); + dst += dst_stride; + + for (; loop_cnt--;) { + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4); + src6 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + src1 = __lsx_vld(src_tmp1, 0); + DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src3, + src5); + src7 = __lsx_vldx(src_tmp1, src_stride3); + src_tmp1 += src_stride4; + + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + DUP2_ARG3(__lsx_vshuf_b, 
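+    /* The mask loaded from mc_filt_mask_arr pairs every byte with its
+     * right-hand neighbour (0 1, 1 2, 2 3, ...), so a single vshuf_b per
+     * 8-pixel half lays the operands out exactly as the 2-tap dot product
+     * below expects. */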
src4, src4, mask, src5, src5, mask, vec4, vec5); + DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, res0, res1, res2, res3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, + filt0, res4, res5, res6, res7); + + DUP4_ARG3(__lsx_vssrarni_bu_h, res1, res0, FILTER_BITS, res3, res2, + FILTER_BITS, res5, res4, FILTER_BITS, res7, res6, FILTER_BITS, + res0, res2, res4, res6); + dst0 = __lsx_vld(dst, 0); + res0 = __lsx_vavgr_bu(res0, dst0); + __lsx_vst(res0, dst, 0); + dst += dst_stride; + + dst0 = __lsx_vld(dst, 0); + res2 = __lsx_vavgr_bu(res2, dst0); + __lsx_vst(res2, dst, 0); + dst += dst_stride; + + dst0 = __lsx_vld(dst, 0); + res4 = __lsx_vavgr_bu(res4, dst0); + __lsx_vst(res4, dst, 0); + dst += dst_stride; + + dst0 = __lsx_vld(dst, 0); + res6 = __lsx_vavgr_bu(res6, dst0); + __lsx_vst(res6, dst, 0); + dst += dst_stride; + } +} + +static void common_hz_2t_and_aver_dst_32w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = (height >> 1); + __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; + __m128i filt0, dst0, dst1; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i res0, res1, res2, res3, res4, res5, res6, res7; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + for (; loop_cnt--;) { + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vld, src, 16, src, 24, src2, src3); + src1 = __lsx_vshuf_b(src2, src0, shuff); + src += src_stride; + src4 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vld, src, 16, src, 24, src6, src7); + src5 = __lsx_vshuf_b(src6, src4, shuff); + src += src_stride; + + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5); + DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7); + + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, res0, res1, res2, res3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, + filt0, res4, res5, res6, res7); + DUP4_ARG3(__lsx_vssrarni_bu_h, res1, res0, FILTER_BITS, res3, res2, + FILTER_BITS, res5, res4, FILTER_BITS, res7, res6, FILTER_BITS, + res0, res2, res4, res6); + + DUP2_ARG2(__lsx_vld, dst, 0, dst, 16, dst0, dst1); + res0 = __lsx_vavgr_bu(res0, dst0); + __lsx_vst(res0, dst, 0); + res2 = __lsx_vavgr_bu(res2, dst1); + __lsx_vst(res2, dst, 16); + dst += dst_stride; + + DUP2_ARG2(__lsx_vld, dst, 0, dst, 16, dst0, dst1); + res4 = __lsx_vavgr_bu(res4, dst0); + __lsx_vst(res4, dst, 0); + res6 = __lsx_vavgr_bu(res6, dst1); + __lsx_vst(res6, dst, 16); + dst += dst_stride; + } +} + +static void common_hz_2t_and_aver_dst_64w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = height; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; + __m128i filt0, dst0, dst1, dst2, dst3; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i out0, out1, out2, out3, out4, out5, out6, out7; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 
0); + + for (; loop_cnt--;) { + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src2, src4, + src6); + src7 = __lsx_vld(src, 56); + DUP2_ARG3(__lsx_vshuf_b, src2, src0, shuff, src4, src2, shuff, src1, src3); + src5 = __lsx_vshuf_b(src6, src4, shuff); + src += src_stride; + + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5); + DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, out0, out1, out2, out3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, + filt0, out4, out5, out6, out7); + + DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2, + FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS, + out0, out2, out4, out6); + + DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, dst0, dst1, dst2, + dst3); + out0 = __lsx_vavgr_bu(out0, dst0); + __lsx_vst(out0, dst, 0); + out2 = __lsx_vavgr_bu(out2, dst1); + __lsx_vst(out2, dst, 16); + out4 = __lsx_vavgr_bu(out4, dst2); + __lsx_vst(out4, dst, 32); + out6 = __lsx_vavgr_bu(out6, dst3); + __lsx_vst(out6, dst, 48); + dst += dst_stride; + } +} + +void vpx_convolve8_avg_horiz_lsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + const int16_t *const filter_x = filter[x0_q4]; + int8_t cnt, filt_hor[8]; + + assert(x_step_q4 == 16); + assert(((const int32_t *)filter_x)[1] != 0x800000); + + for (cnt = 0; cnt < 8; ++cnt) { + filt_hor[cnt] = filter_x[cnt]; + } + + if (vpx_get_filter_taps(filter_x) == 2) { + switch (w) { + case 4: + common_hz_2t_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], h); + break; + case 8: + common_hz_2t_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], h); + break; + case 16: + common_hz_2t_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], h); + break; + + case 32: + common_hz_2t_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], h); + break; + case 64: + common_hz_2t_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], h); + break; + default: + vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } else { + switch (w) { + case 4: + common_hz_8t_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, h); + break; + case 8: + common_hz_8t_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, h); + break; + case 16: + common_hz_8t_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, h); + break; + case 32: + common_hz_8t_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, h); + break; + case 64: + common_hz_8t_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, h); + break; + default: + vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c 
b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c new file mode 100644 index 0000000000..d1abf622ad --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c @@ -0,0 +1,737 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/vpx_convolve_lsx.h" + +static const uint8_t mc_filt_mask_arr[16 * 3] = { + /* 8 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, + /* 4 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20, + /* 4 width cases */ + 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 +}; + +static void common_hv_8ht_8vt_and_aver_dst_4w_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height) { + uint32_t loop_cnt = height >> 2; + uint8_t *dst_tmp = dst; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3; + __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3; + __m128i mask0, mask1, mask2, mask3; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + __m128i out0, out1; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + uint8_t *_src = (uint8_t *)src - 3 - src_stride3; + + mask0 = __lsx_vld(mc_filt_mask_arr, 16); + DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz, 4, + filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3); + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + + src0 = __lsx_vld(_src, 0); + DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2); + src3 = __lsx_vldx(_src, src_stride3); + _src += src_stride4; + src4 = __lsx_vld(_src, 0); + DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6); + _src += src_stride3; + + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5); + src6 = __lsx_vxori_b(src6, 128); + + tmp0 = horiz_8tap_filt(src0, src1, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + tmp2 = horiz_8tap_filt(src2, src3, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + tmp4 = horiz_8tap_filt(src4, src5, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + tmp5 = horiz_8tap_filt(src5, src6, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + DUP2_ARG3(__lsx_vshuf_b, tmp2, tmp0, shuff, tmp4, tmp2, shuff, tmp1, tmp3); + DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4, + filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3); + DUP2_ARG2(__lsx_vpackev_b, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1); + tmp2 = __lsx_vpackev_b(tmp5, tmp4); + for (; loop_cnt--;) { + src7 = __lsx_vld(_src, 0); + DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9); + src10 = __lsx_vldx(_src, src_stride3); + _src += src_stride4; + src2 = 
__lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + src3 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + src4 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + src5 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_w, src3, src2, src5, src4, src2, src3); + src2 = __lsx_vilvl_d(src3, src2); + DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7, + src8, src9, src10); + tmp3 = horiz_8tap_filt(src7, src8, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + tmp4 = __lsx_vshuf_b(tmp3, tmp5, shuff); + tmp4 = __lsx_vpackev_b(tmp3, tmp4); + out0 = filt_8tap_dpadd_s_h(tmp0, tmp1, tmp2, tmp4, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + src1 = horiz_8tap_filt(src9, src10, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src0 = __lsx_vshuf_b(src1, tmp3, shuff); + src0 = __lsx_vpackev_b(src1, src0); + out1 = filt_8tap_dpadd_s_h(tmp1, tmp2, tmp4, src0, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + out0 = __lsx_vssrarni_b_h(out1, out0, FILTER_BITS); + out0 = __lsx_vxori_b(out0, 128); + out0 = __lsx_vavgr_bu(out0, src2); + __lsx_vstelm_w(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 3); + dst += dst_stride; + + tmp5 = src1; + tmp0 = tmp2; + tmp1 = tmp4; + tmp2 = src0; + } +} + +static void common_hv_8ht_8vt_and_aver_dst_8w_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height) { + uint32_t loop_cnt = height >> 2; + uint8_t *dst_tmp = dst; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3; + __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3; + __m128i mask0, mask1, mask2, mask3; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6; + __m128i out0, out1; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + uint8_t *_src = (uint8_t *)src - 3 - src_stride3; + + mask0 = __lsx_vld(mc_filt_mask_arr, 0); + DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz, 4, + filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3); + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + + src0 = __lsx_vld(_src, 0); + DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2); + src3 = __lsx_vldx(_src, src_stride3); + _src += src_stride4; + src4 = __lsx_vld(_src, 0); + DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6); + _src += src_stride3; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5); + src6 = __lsx_vxori_b(src6, 128); + + src0 = horiz_8tap_filt(src0, src0, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src1 = horiz_8tap_filt(src1, src1, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src2 = horiz_8tap_filt(src2, src2, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src3 = horiz_8tap_filt(src3, src3, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src4 = horiz_8tap_filt(src4, src4, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src5 = horiz_8tap_filt(src5, src5, mask0, mask1, mask2, mask3, 
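+  /* Seven priming rows, fetched starting three rows above and three
+   * columns to the left of the block, are filtered horizontally up front;
+   * the vertical 8-tap in the loop below then consumes them as a sliding
+   * window, so every source row goes through the horizontal filter exactly
+   * once. */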
filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src6 = horiz_8tap_filt(src6, src6, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + + DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4, + filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3); + DUP4_ARG2(__lsx_vpackev_b, src1, src0, src3, src2, src5, src4, src2, src1, + tmp0, tmp1, tmp2, tmp4); + DUP2_ARG2(__lsx_vpackev_b, src4, src3, src6, src5, tmp5, tmp6); + + for (; loop_cnt--;) { + src7 = __lsx_vld(_src, 0); + DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9); + src10 = __lsx_vldx(_src, src_stride3); + _src += src_stride4; + + DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7, + src8, src9, src10); + src7 = horiz_8tap_filt(src7, src7, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + tmp3 = __lsx_vpackev_b(src7, src6); + out0 = filt_8tap_dpadd_s_h(tmp0, tmp1, tmp2, tmp3, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + src8 = horiz_8tap_filt(src8, src8, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src0 = __lsx_vpackev_b(src8, src7); + out1 = filt_8tap_dpadd_s_h(tmp4, tmp5, tmp6, src0, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + src9 = horiz_8tap_filt(src9, src9, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src1 = __lsx_vpackev_b(src9, src8); + src3 = filt_8tap_dpadd_s_h(tmp1, tmp2, tmp3, src1, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + src10 = horiz_8tap_filt(src10, src10, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src2 = __lsx_vpackev_b(src10, src9); + src4 = filt_8tap_dpadd_s_h(tmp5, tmp6, src0, src2, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, FILTER_BITS, src4, src3, + FILTER_BITS, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + src5 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + src7 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + src8 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + src9 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_d, src7, src5, src9, src8, src5, src7); + DUP2_ARG2(__lsx_vavgr_bu, out0, src5, out1, src7, out0, out1); + __lsx_vstelm_d(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 1); + dst += dst_stride; + + src6 = src10; + tmp0 = tmp2; + tmp1 = tmp3; + tmp2 = src1; + tmp4 = tmp6; + tmp5 = src0; + tmp6 = src2; + } +} + +static void common_hv_8ht_8vt_and_aver_dst_16w_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height) { + common_hv_8ht_8vt_and_aver_dst_8w_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + src += 8; + dst += 8; + + common_hv_8ht_8vt_and_aver_dst_8w_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); +} + +static void common_hv_8ht_8vt_and_aver_dst_32w_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height) { + int32_t multiple8_cnt; + + for (multiple8_cnt = 4; multiple8_cnt--;) { + common_hv_8ht_8vt_and_aver_dst_8w_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + src += 8; + dst += 8; + } +} + +static void common_hv_8ht_8vt_and_aver_dst_64w_lsx( + const 
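+    /* The 16-, 32- and 64-pixel-wide 8-tap 2-D paths all reuse the 8-wide
+     * kernel on successive 8-pixel columns (2, 4 and 8 passes
+     * respectively). */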
uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height) { + int32_t multiple8_cnt; + + for (multiple8_cnt = 8; multiple8_cnt--;) { + common_hv_8ht_8vt_and_aver_dst_8w_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + + src += 8; + dst += 8; + } +} + +static void common_hv_2ht_2vt_and_aver_dst_4x4_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert) { + __m128i src0, src1, src2, src3, src4, mask; + __m128i filt_hz, filt_vt, vec0, vec1; + __m128i dst0, dst1, dst2, dst3; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + + mask = __lsx_vld(mc_filt_mask_arr, 16); + /* rearranging filter */ + DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt); + + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + + hz_out0 = horiz_2tap_filt_uh(src0, src1, mask, filt_hz); + hz_out2 = horiz_2tap_filt_uh(src2, src3, mask, filt_hz); + hz_out4 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); + hz_out1 = __lsx_vshuf_b(hz_out2, hz_out0, shuff); + hz_out3 = __lsx_vpickod_d(hz_out4, hz_out2); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + + dst0 = __lsx_vldrepl_w(dst, 0); + dst1 = __lsx_vldrepl_w(dst + dst_stride, 0); + dst2 = __lsx_vldrepl_w(dst + dst_stride2, 0); + dst3 = __lsx_vldrepl_w(dst + dst_stride3, 0); + dst0 = __lsx_vilvl_w(dst1, dst0); + dst1 = __lsx_vilvl_w(dst3, dst2); + dst0 = __lsx_vilvl_d(dst1, dst0); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst0); + __lsx_vstelm_w(tmp0, dst, 0, 0); + __lsx_vstelm_w(tmp0, dst + dst_stride, 0, 1); + __lsx_vstelm_w(tmp0, dst + dst_stride2, 0, 2); + __lsx_vstelm_w(tmp0, dst + dst_stride3, 0, 3); +} + +static void common_hv_2ht_2vt_and_aver_dst_4x8_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert) { + uint8_t *dst_tmp = dst; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, mask; + __m128i filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + __m128i hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3; + __m128i dst0, dst1, dst2, dst3, dst4; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + mask = __lsx_vld(mc_filt_mask_arr, 16); + + /* rearranging filter */ + filt_hz = __lsx_vldrepl_h(filter_horiz, 0); + filt_vt = __lsx_vldrepl_h(filter_vert, 0); + + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + src += src_stride4; + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src5, src6, src7, src8); + src += src_stride4; + + hz_out0 = horiz_2tap_filt_uh(src0, src1, mask, filt_hz); 
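+  /* Bilinear 2-D path: rows are first filtered horizontally into rounded
+   * intermediates (horiz_2tap_filt_uh), and the vertical stage then blends
+   * vertically adjacent intermediates before averaging with dst. A rough
+   * scalar model of one output pixel, using libvpx's ROUND_POWER_OF_TWO
+   * macro and illustrative tap names f0/f1 (horizontal) and g0/g1
+   * (vertical):
+   *
+   *   hz[r][c]  = ROUND_POWER_OF_TWO(src[r][c] * f0 + src[r][c + 1] * f1,
+   *                                  FILTER_BITS);
+   *   out       = ROUND_POWER_OF_TWO(hz[r][c] * g0 + hz[r + 1][c] * g1,
+   *                                  FILTER_BITS);
+   *   dst[r][c] = ROUND_POWER_OF_TWO(dst[r][c] + out, 1);
+   */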
+ hz_out2 = horiz_2tap_filt_uh(src2, src3, mask, filt_hz); + hz_out4 = horiz_2tap_filt_uh(src4, src5, mask, filt_hz); + hz_out6 = horiz_2tap_filt_uh(src6, src7, mask, filt_hz); + hz_out8 = horiz_2tap_filt_uh(src8, src8, mask, filt_hz); + DUP2_ARG3(__lsx_vshuf_b, hz_out2, hz_out0, shuff, hz_out4, hz_out2, shuff, + hz_out1, hz_out3); + hz_out5 = __lsx_vshuf_b(hz_out6, hz_out4, shuff); + hz_out7 = __lsx_vpickod_d(hz_out8, hz_out6); + + dst0 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst0 = __lsx_vilvl_w(dst1, dst0); + dst1 = __lsx_vilvl_w(dst3, dst2); + dst0 = __lsx_vilvl_d(dst1, dst0); + + dst1 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst4 = __lsx_vldrepl_w(dst_tmp, 0); + dst1 = __lsx_vilvl_w(dst2, dst1); + dst2 = __lsx_vilvl_w(dst4, dst3); + dst1 = __lsx_vilvl_d(dst2, dst1); + + DUP4_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, hz_out5, + hz_out4, hz_out7, hz_out6, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, vec2, filt_vt, vec3, + filt_vt, tmp0, tmp1, tmp2, tmp3); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, res0, res1); + DUP2_ARG2(__lsx_vavgr_bu, res0, dst0, res1, dst1, res0, res1); + + __lsx_vstelm_w(res0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(res0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(res0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(res0, dst, 0, 3); + dst += dst_stride; + + __lsx_vstelm_w(res1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(res1, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(res1, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(res1, dst, 0, 3); +} + +static void common_hv_2ht_2vt_and_aver_dst_4w_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + if (height == 4) { + common_hv_2ht_2vt_and_aver_dst_4x4_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert); + } else if (height == 8) { + common_hv_2ht_2vt_and_aver_dst_4x8_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert); + } +} + +static void common_hv_2ht_2vt_and_aver_dst_8x4_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert) { + __m128i src0, src1, src2, src3, src4, mask; + __m128i filt_hz, filt_vt, vec0, vec1, vec2, vec3; + __m128i hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; + __m128i dst0, dst1, dst2, dst3; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + uint8_t *dst_tmp = dst; + mask = __lsx_vld(mc_filt_mask_arr, 0); + /* rearranging filter */ + filt_hz = __lsx_vldrepl_h(filter_horiz, 0); + filt_vt = __lsx_vldrepl_h(filter_vert, 0); + + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + + dst0 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_d(dst_tmp, 0); + DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, 
dst0, dst1); + hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz); + hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp0 = __lsx_vdp2_h_bu(vec0, filt_vt); + + hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz); + vec1 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp1 = __lsx_vdp2_h_bu(vec1, filt_vt); + + hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz); + vec2 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp2 = __lsx_vdp2_h_bu(vec2, filt_vt); + + hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); + vec3 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp3 = __lsx_vdp2_h_bu(vec3, filt_vt); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, tmp0, tmp1); + AVG_ST4_D(tmp0, tmp1, dst0, dst1, dst, dst_stride); +} + +static void common_hv_2ht_2vt_and_aver_dst_8x8mult_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + uint32_t loop_cnt = (height >> 2); + __m128i src0, src1, src2, src3, src4, mask; + __m128i filt_hz, filt_vt, vec0; + __m128i hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; + __m128i dst0, dst1, dst2, dst3; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + uint8_t *dst_tmp = dst; + + /* rearranging filter */ + mask = __lsx_vld(mc_filt_mask_arr, 0); + + filt_hz = __lsx_vldrepl_h(filter_horiz, 0); + filt_vt = __lsx_vldrepl_h(filter_vert, 0); + + src0 = __lsx_vld(src, 0); + src += src_stride; + + hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz); + + for (; loop_cnt--;) { + src1 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3); + src4 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp0 = __lsx_vdp2_h_bu(vec0, filt_vt); + + hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz); + vec0 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt); + + hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt); + + hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); + vec0 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, tmp0, tmp1); + + dst0 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); + AVG_ST4_D(tmp0, tmp1, dst0, dst1, dst, dst_stride); + dst += dst_stride; + } +} + +static void common_hv_2ht_2vt_and_aver_dst_8w_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + if (height == 4) { + common_hv_2ht_2vt_and_aver_dst_8x4_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert); + } else { + common_hv_2ht_2vt_and_aver_dst_8x8mult_lsx( + src, src_stride, dst, dst_stride, filter_horiz, filter_vert, height); + } +} + +static void common_hv_2ht_2vt_and_aver_dst_16w_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, 
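+    /* Note: hz_out0 and hz_out2 carry the horizontally filtered previous
+     * row across loop iterations, so the vertical 2-tap touches each row
+     * only once. */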
int32_t height) { + uint8_t *src_tmp1; + uint32_t loop_cnt = (height >> 2); + __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; + __m128i filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1, tmp3; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride << 2; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + /* rearranging filter */ + filt_hz = __lsx_vldrepl_h(filter_horiz, 0); + filt_vt = __lsx_vldrepl_h(filter_vert, 0); + + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1); + src += src_stride; + + hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz); + hz_out2 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz); + + for (; loop_cnt--;) { + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4); + src6 = __lsx_vldx(src, src_stride3); + src_tmp1 = (uint8_t *)(src + 8); + src1 = __lsx_vld(src_tmp1, 0); + DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src3, + src5); + src7 = __lsx_vldx(src_tmp1, src_stride3); + src += src_stride4; + dst0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2); + dst3 = __lsx_vldx(dst, dst_stride3); + + hz_out1 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz); + hz_out3 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp3 = __lsx_vavgr_bu(tmp3, dst0); + __lsx_vst(tmp3, dst, 0); + + hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz); + hz_out2 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz); + DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp3 = __lsx_vavgr_bu(tmp3, dst1); + __lsx_vstx(tmp3, dst, dst_stride); + + hz_out1 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); + hz_out3 = horiz_2tap_filt_uh(src5, src5, mask, filt_hz); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp3 = __lsx_vavgr_bu(tmp3, dst2); + __lsx_vstx(tmp3, dst, dst_stride2); + + hz_out0 = horiz_2tap_filt_uh(src6, src6, mask, filt_hz); + hz_out2 = horiz_2tap_filt_uh(src7, src7, mask, filt_hz); + DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp3 = __lsx_vavgr_bu(tmp3, dst3); + __lsx_vstx(tmp3, dst, dst_stride3); + dst += dst_stride4; + } +} + +static void common_hv_2ht_2vt_and_aver_dst_32w_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + common_hv_2ht_2vt_and_aver_dst_16w_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + src += 16; + dst += 16; + + common_hv_2ht_2vt_and_aver_dst_16w_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); +} + +static void common_hv_2ht_2vt_and_aver_dst_64w_lsx( + const uint8_t 
*src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 4; multiple8_cnt--;) { + common_hv_2ht_2vt_and_aver_dst_16w_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + src += 16; + dst += 16; + } +} + +void vpx_convolve8_avg_lsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_x = filter[x0_q4]; + const int16_t *const filter_y = filter[y0_q4]; + int8_t cnt, filt_hor[8], filt_ver[8]; + + assert(x_step_q4 == 16); + assert(y_step_q4 == 16); + assert(((const int32_t *)filter_x)[1] != 0x800000); + assert(((const int32_t *)filter_y)[1] != 0x800000); + + for (cnt = 0; cnt < 8; ++cnt) { + filt_hor[cnt] = filter_x[cnt]; + filt_ver[cnt] = filter_y[cnt]; + } + if (vpx_get_filter_taps(filter_x) == 2 && + vpx_get_filter_taps(filter_y) == 2) { + switch (w) { + case 4: + common_hv_2ht_2vt_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], h); + break; + case 8: + common_hv_2ht_2vt_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], h); + break; + case 16: + common_hv_2ht_2vt_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, + &filt_hor[3], &filt_ver[3], h); + break; + case 32: + common_hv_2ht_2vt_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, + &filt_hor[3], &filt_ver[3], h); + break; + case 64: + common_hv_2ht_2vt_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, + &filt_hor[3], &filt_ver[3], h); + break; + default: + vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } else if (vpx_get_filter_taps(filter_x) == 2 || + vpx_get_filter_taps(filter_y) == 2) { + vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + } else { + switch (w) { + case 4: + common_hv_8ht_8vt_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, + filt_ver, h); + break; + case 8: + common_hv_8ht_8vt_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, + filt_ver, h); + break; + case 16: + common_hv_8ht_8vt_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, + filt_ver, h); + break; + case 32: + common_hv_8ht_8vt_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, + filt_ver, h); + break; + case 64: + common_hv_8ht_8vt_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, + filt_ver, h); + break; + default: + vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c new file mode 100644 index 0000000000..5c6413df44 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c @@ -0,0 +1,918 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/vpx_convolve_lsx.h" + +static void common_vt_8t_and_aver_dst_4w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) { + uint32_t loop_cnt = (height >> 2); + uint8_t *dst_tmp = dst; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + __m128i reg0, reg1, reg2, reg3, reg4; + __m128i filter0, filter1, filter2, filter3; + __m128i out0, out1; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + uint8_t *src_tmp0 = (uint8_t *)src - src_stride3; + + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + src0 = __lsx_vld(src_tmp0, 0); + DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src1, + src2); + src3 = __lsx_vldx(src_tmp0, src_stride3); + src_tmp0 += src_stride4; + src4 = __lsx_vld(src_tmp0, 0); + DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src5, + src6); + src_tmp0 += src_stride3; + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, tmp0, + tmp1, tmp2, tmp3); + DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, tmp4, tmp5); + DUP2_ARG2(__lsx_vilvl_d, tmp3, tmp0, tmp4, tmp1, reg0, reg1); + reg2 = __lsx_vilvl_d(tmp5, tmp2); + DUP2_ARG2(__lsx_vxori_b, reg0, 128, reg1, 128, reg0, reg1); + reg2 = __lsx_vxori_b(reg2, 128); + + for (; loop_cnt--;) { + src7 = __lsx_vld(src_tmp0, 0); + DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src8, + src9); + src10 = __lsx_vldx(src_tmp0, src_stride3); + src_tmp0 += src_stride4; + src0 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + src1 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + src2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + src3 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_w, src1, src0, src3, src2, src0, src1); + src0 = __lsx_vilvl_d(src1, src0); + DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9, + tmp0, tmp1, tmp2, tmp3); + DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, reg3, reg4); + DUP2_ARG2(__lsx_vxori_b, reg3, 128, reg4, 128, reg3, reg4); + out0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, reg3, filter0, filter1, + filter2, filter3); + out1 = filt_8tap_dpadd_s_h(reg1, reg2, reg3, reg4, filter0, filter1, + filter2, filter3); + out0 = __lsx_vssrarni_b_h(out1, out0, 7); + out0 = __lsx_vxori_b(out0, 128); + out0 = __lsx_vavgr_bu(out0, src0); + __lsx_vstelm_w(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 3); + dst += dst_stride; + reg0 = reg2; + reg1 = reg3; + reg2 = reg4; + src6 = src10; + } +} + +static void common_vt_8t_and_aver_dst_8w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) { + uint32_t loop_cnt = height >> 2; + uint8_t *dst_tmp = dst; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i reg0, reg1, reg2, reg3, reg4, reg5; + __m128i filter0, 
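+      /* Vertical 8-tap layout: __lsx_vilvl_b interleaves vertically
+       * adjacent rows into byte pairs so that each filt_8tap_dpadd_s_h
+       * call can accumulate all four tap pairs; the reg0..reg5 rotation at
+       * the bottom of the loop preserves the seven-row history the next
+       * iteration needs. */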
filter1, filter2, filter3; + __m128i out0, out1, out2, out3; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + uint8_t *src_tmp0 = (uint8_t *)src - src_stride3; + + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + + src0 = __lsx_vld(src_tmp0, 0); + DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src1, + src2); + src3 = __lsx_vldx(src_tmp0, src_stride3); + src_tmp0 += src_stride4; + src4 = __lsx_vld(src_tmp0, 0); + DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src5, + src6); + src_tmp0 += src_stride3; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5); + src6 = __lsx_vxori_b(src6, 128); + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, reg0, + reg1, reg2, reg3); + DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5); + + for (; loop_cnt--;) { + src7 = __lsx_vld(src_tmp0, 0); + DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src8, + src9); + src10 = __lsx_vldx(src_tmp0, src_stride3); + src_tmp0 += src_stride4; + src0 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + src1 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + src2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + src3 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_d, src1, src0, src3, src2, src0, src1); + DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7, + src8, src9, src10); + DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9, + tmp0, tmp1, tmp2, tmp3); + out0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, tmp0, filter0, filter1, + filter2, filter3); + out1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, tmp1, filter0, filter1, + filter2, filter3); + out2 = filt_8tap_dpadd_s_h(reg1, reg2, tmp0, tmp2, filter0, filter1, + filter2, filter3); + out3 = filt_8tap_dpadd_s_h(reg4, reg5, tmp1, tmp3, filter0, filter1, + filter2, filter3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + DUP2_ARG2(__lsx_vavgr_bu, out0, src0, out1, src1, out0, out1); + __lsx_vstelm_d(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 1); + dst += dst_stride; + reg0 = reg2; + reg1 = tmp0; + reg2 = tmp2; + reg3 = reg5; + reg4 = tmp1; + reg5 = tmp3; + src6 = src10; + } +} + +static void common_vt_8t_and_aver_dst_16w_mult_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height, int32_t width) { + uint8_t *src_tmp; + uint32_t cnt = width >> 4; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + __m128i filter0, filter1, filter2, filter3; + __m128i reg0, reg1, reg2, reg3, reg4, reg5; + __m128i reg6, reg7, reg8, reg9, reg10, reg11; + __m128i tmp0, tmp1, tmp2, tmp3; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + uint8_t *src_tmp0 = (uint8_t *)src - src_stride3; + + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, 
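+    /* The eight 8-bit taps are loaded as four replicated halfwords (taps
+     * 0-1, 2-3, 4-5 and 6-7), matching the pairwise accumulation performed
+     * by filt_8tap_dpadd_s_h. */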
filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + for (; cnt--;) { + uint32_t loop_cnt = height >> 2; + uint8_t *dst_reg = dst; + + src_tmp = src_tmp0; + src0 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src1, + src2); + src3 = __lsx_vldx(src_tmp, src_stride3); + src_tmp += src_stride4; + src4 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5, + src6); + src_tmp += src_stride3; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5); + src6 = __lsx_vxori_b(src6, 128); + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, + reg0, reg1, reg2, reg3); + DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5); + DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1, + reg6, reg7, reg8, reg9); + DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, reg10, reg11); + for (; loop_cnt--;) { + src7 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src8, + src9); + src10 = __lsx_vldx(src_tmp, src_stride3); + src_tmp += src_stride4; + DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, + src7, src8, src9, src10); + DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9, + src0, src1, src2, src3); + DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10, src9, + src4, src5, src7, src8); + tmp0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, src0, filter0, filter1, + filter2, filter3); + tmp1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, src1, filter0, filter1, + filter2, filter3); + tmp2 = filt_8tap_dpadd_s_h(reg6, reg7, reg8, src4, filter0, filter1, + filter2, filter3); + tmp3 = filt_8tap_dpadd_s_h(reg9, reg10, reg11, src5, filter0, filter1, + filter2, filter3); + DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + tmp2 = __lsx_vld(dst_reg, 0); + tmp3 = __lsx_vldx(dst_reg, dst_stride); + DUP2_ARG2(__lsx_vavgr_bu, tmp0, tmp2, tmp1, tmp3, tmp0, tmp1); + __lsx_vst(tmp0, dst_reg, 0); + __lsx_vstx(tmp1, dst_reg, dst_stride); + tmp0 = filt_8tap_dpadd_s_h(reg1, reg2, src0, src2, filter0, filter1, + filter2, filter3); + tmp1 = filt_8tap_dpadd_s_h(reg4, reg5, src1, src3, filter0, filter1, + filter2, filter3); + tmp2 = filt_8tap_dpadd_s_h(reg7, reg8, src4, src7, filter0, filter1, + filter2, filter3); + tmp3 = filt_8tap_dpadd_s_h(reg10, reg11, src5, src8, filter0, filter1, + filter2, filter3); + DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + tmp2 = __lsx_vldx(dst_reg, dst_stride2); + tmp3 = __lsx_vldx(dst_reg, dst_stride3); + DUP2_ARG2(__lsx_vavgr_bu, tmp0, tmp2, tmp1, tmp3, tmp0, tmp1); + __lsx_vstx(tmp0, dst_reg, dst_stride2); + __lsx_vstx(tmp1, dst_reg, dst_stride3); + dst_reg += dst_stride4; + + reg0 = reg2; + reg1 = src0; + reg2 = src2; + reg3 = reg5; + reg4 = src1; + reg5 = src3; + reg6 = reg8; + reg7 = src4; + reg8 = src7; + reg9 = reg11; + reg10 = src5; + reg11 = src8; + src6 = src10; + } + src_tmp0 += 16; + dst += 16; + } +} + +static void common_vt_8t_and_aver_dst_16w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) { + common_vt_8t_and_aver_dst_16w_mult_lsx(src, src_stride, dst, dst_stride, + filter, height, 16); +} + +static void 
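+/* The 16-, 32- and 64-wide vertical 8-tap paths all share
+ * common_vt_8t_and_aver_dst_16w_mult_lsx, which walks the block in
+ * 16-pixel columns. */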
common_vt_8t_and_aver_dst_32w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) { + common_vt_8t_and_aver_dst_16w_mult_lsx(src, src_stride, dst, dst_stride, + filter, height, 32); +} + +static void common_vt_8t_and_aver_dst_64w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) { + common_vt_8t_and_aver_dst_16w_mult_lsx(src, src_stride, dst, dst_stride, + filter, height, 64); +} + +static void common_vt_2t_and_aver_dst_4x4_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + __m128i src0, src1, src2, src3, src4; + __m128i dst0, dst1, dst2, dst3, out, filt0, src2110, src4332; + __m128i src10_r, src32_r, src21_r, src43_r; + __m128i tmp0, tmp1; + uint8_t *dst_tmp = dst; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + filt0 = __lsx_vldrepl_h(filter, 0); + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + src4 = __lsx_vld(src, 0); + src += src_stride; + + dst0 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_w(dst_tmp, 0); + dst0 = __lsx_vilvl_w(dst1, dst0); + dst1 = __lsx_vilvl_w(dst3, dst2); + dst0 = __lsx_vilvl_d(dst1, dst0); + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, + src10_r, src21_r, src32_r, src43_r); + DUP2_ARG2(__lsx_vilvl_d, src21_r, src10_r, src43_r, src32_r, src2110, + src4332); + DUP2_ARG2(__lsx_vdp2_h_bu, src2110, filt0, src4332, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + out = __lsx_vavgr_bu(tmp0, dst0); + __lsx_vstelm_w(out, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out, dst, 0, 3); + dst += dst_stride; +} + +static void common_vt_2t_and_aver_dst_4x8_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + __m128i dst0, dst1, dst2, dst3, dst4; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src87_r; + __m128i src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; + __m128i src2110, src4332, src6554, src8776, filt0; + __m128i tmp0, tmp1, tmp2, tmp3; + uint8_t *dst_tmp = dst; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + filt0 = __lsx_vldrepl_h(filter, 0); + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + src4 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6); + src7 = __lsx_vldx(src, src_stride3); + src += src_stride4; + src8 = __lsx_vld(src, 0); + + dst0 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst0 = __lsx_vilvl_w(dst1, dst0); + dst1 = __lsx_vilvl_w(dst3, dst2); + dst0 = __lsx_vilvl_d(dst1, dst0); + + dst1 = __lsx_vldrepl_w(dst_tmp, 0); + 
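/* Rows 4..7 of the destination block are gathered the same way into a second vector. */ +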
dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst4 = __lsx_vldrepl_w(dst_tmp, 0); + dst1 = __lsx_vilvl_w(dst2, dst1); + dst2 = __lsx_vilvl_w(dst4, dst3); + dst1 = __lsx_vilvl_d(dst2, dst1); + + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, + src10_r, src21_r, src32_r, src43_r); + DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7, + src54_r, src65_r, src76_r, src87_r); + DUP4_ARG2(__lsx_vilvl_d, src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, + src87_r, src76_r, src2110, src4332, src6554, src8776); + DUP4_ARG2(__lsx_vdp2_h_bu, src2110, filt0, src4332, filt0, src6554, filt0, + src8776, filt0, tmp0, tmp1, tmp2, tmp3); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, tmp0, tmp2); + DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp2, dst1, tmp0, tmp2); + __lsx_vstelm_w(tmp0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(tmp0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(tmp0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(tmp0, dst, 0, 3); + dst += dst_stride; + + __lsx_vstelm_w(tmp2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(tmp2, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(tmp2, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(tmp2, dst, 0, 3); +} + +static void common_vt_2t_and_aver_dst_4w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + if (height == 4) { + common_vt_2t_and_aver_dst_4x4_lsx(src, src_stride, dst, dst_stride, filter); + } else if (height == 8) { + common_vt_2t_and_aver_dst_4x8_lsx(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_vt_2t_and_aver_dst_8x4_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + __m128i src0, src1, src2, src3, src4; + __m128i dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3, filt0; + __m128i tmp0, tmp1, tmp2, tmp3; + uint8_t *dst_tmp = dst; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + filt0 = __lsx_vldrepl_h(filter, 0); + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + dst0 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); + DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec1); + DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + tmp0, tmp1, tmp2, tmp3); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, tmp0, tmp2); + DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp2, dst1, tmp0, tmp2); + __lsx_vstelm_d(tmp0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(tmp0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(tmp2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(tmp2, dst, 0, 1); +} + +static void common_vt_2t_and_aver_dst_8x8mult_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = (height >> 3); + __m128i src0, src1, src2, src3, src4, 
src5, src6, src7, src8; + __m128i dst0, dst1, dst2, dst3, dst4, dst5; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + __m128i tmp0, tmp1, tmp2, tmp3; + uint8_t *dst_tmp = dst; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + filt0 = __lsx_vldrepl_h(filter, 0); + src0 = __lsx_vld(src, 0); + src += src_stride; + + for (; loop_cnt--;) { + src1 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3); + src4 = __lsx_vldx(src, src_stride3); + src += src_stride4; + src5 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src6, src7); + src8 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + dst0 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); + + dst2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst4 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst5 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_d, dst3, dst2, dst5, dst4, dst2, dst3); + + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, + vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7, + vec4, vec5, vec6, vec7); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, tmp0, tmp1, tmp2, tmp3); + + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, tmp0, tmp2); + DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp2, dst1, tmp0, tmp2); + __lsx_vstelm_d(tmp0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(tmp0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(tmp2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(tmp2, dst, 0, 1); + dst += dst_stride; + + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, + filt0, tmp0, tmp1, tmp2, tmp3); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, tmp0, tmp2); + DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst2, tmp2, dst3, tmp0, tmp2); + __lsx_vstelm_d(tmp0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(tmp0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(tmp2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(tmp2, dst, 0, 1); + dst += dst_stride; + + src0 = src8; + } +} + +static void common_vt_2t_and_aver_dst_8w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + if (height == 4) { + common_vt_2t_and_aver_dst_8x4_lsx(src, src_stride, dst, dst_stride, filter); + } else { + common_vt_2t_and_aver_dst_8x8mult_lsx(src, src_stride, dst, dst_stride, + filter, height); + } +} + +static void common_vt_2t_and_aver_dst_16w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = (height >> 2); + __m128i src0, src1, src2, src3, src4, dst0, dst1, dst2, dst3, filt0; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i tmp0, tmp1; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = 
dst_stride2 + dst_stride; + + filt0 = __lsx_vldrepl_h(filter, 0); + src0 = __lsx_vld(src, 0); + src += src_stride; + + for (; loop_cnt--;) { + src1 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3); + src4 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + dst0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2); + dst3 = __lsx_vldx(dst, dst_stride3); + + DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2); + DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst0); + __lsx_vst(tmp0, dst, 0); + dst += dst_stride; + + DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6); + DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7); + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst1); + __lsx_vst(tmp0, dst, 0); + dst += dst_stride; + + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst2); + __lsx_vst(tmp0, dst, 0); + dst += dst_stride; + + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst3); + __lsx_vst(tmp0, dst, 0); + dst += dst_stride; + + src0 = src4; + } +} + +static void common_vt_2t_and_aver_dst_32w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = (height >> 2); + uint8_t *src_tmp1; + uint8_t *dst_tmp1; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9; + __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + __m128i tmp0, tmp1; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + + filt0 = __lsx_vldrepl_h(filter, 0); + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src5); + src += src_stride; + + for (; loop_cnt--;) { + src1 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3); + src4 = __lsx_vldx(src, src_stride3); + + dst0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2); + dst3 = __lsx_vldx(dst, dst_stride3); + + DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2); + DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3); + + src_tmp1 = src + 16; + src6 = __lsx_vld(src_tmp1, 0); + DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src7, + src8); + src9 = __lsx_vldx(src_tmp1, src_stride3); + + dst_tmp1 = dst + 16; + dst4 = __lsx_vld(dst_tmp1, 0); + DUP2_ARG2(__lsx_vldx, dst_tmp1, dst_stride, dst_tmp1, dst_stride2, dst5, + dst6); + dst7 = __lsx_vldx(dst_tmp1, dst_stride3); + src += src_stride4; + + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst0); + __lsx_vst(tmp0, dst, 0); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst1); + 
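/* Left-half rows are stored with vstx at stride offsets; dst itself only advances while the right half (offset 16) is written. */ +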
__lsx_vstx(tmp0, dst, dst_stride); + + DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6); + DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7); + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst2); + __lsx_vstx(tmp0, dst, dst_stride2); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst3); + __lsx_vstx(tmp0, dst, dst_stride3); + + DUP2_ARG2(__lsx_vilvl_b, src6, src5, src7, src6, vec0, vec2); + DUP2_ARG2(__lsx_vilvh_b, src6, src5, src7, src6, vec1, vec3); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst4); + __lsx_vst(tmp0, dst, 16); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst5); + dst += dst_stride; + __lsx_vst(tmp0, dst, 16); + + DUP2_ARG2(__lsx_vilvl_b, src8, src7, src9, src8, vec4, vec6); + DUP2_ARG2(__lsx_vilvh_b, src8, src7, src9, src8, vec5, vec7); + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst6); + dst += dst_stride; + __lsx_vst(tmp0, dst, 16); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst7); + dst += dst_stride; + __lsx_vst(tmp0, dst, 16); + dst += dst_stride; + + src0 = src4; + src5 = src9; + } +} + +static void common_vt_2t_and_aver_dst_64w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = (height >> 1); + int32_t src_stride2 = src_stride << 1; + int32_t dst_stride2 = dst_stride << 1; + uint8_t *src_tmp1; + uint8_t *dst_tmp1; + __m128i src0, src1, src2, src3, src4, src5; + __m128i src6, src7, src8, src9, src10, src11, filt0; + __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i tmp0, tmp1; + + filt0 = __lsx_vldrepl_h(filter, 0); + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src3, src6, + src9); + src += src_stride; + + for (; loop_cnt--;) { + src2 = __lsx_vldx(src, src_stride); + dst1 = __lsx_vldx(dst, dst_stride); + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src1, src4, src7, + src10); + DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, dst0, dst2, dst4, + dst6); + src_tmp1 = (uint8_t *)src + 16; + src5 = __lsx_vldx(src_tmp1, src_stride); + src_tmp1 = src_tmp1 + 16; + src8 = __lsx_vldx(src_tmp1, src_stride); + src_tmp1 = src_tmp1 + 16; + src11 = __lsx_vldx(src_tmp1, src_stride); + + dst_tmp1 = dst + 16; + dst3 = __lsx_vldx(dst_tmp1, dst_stride); + dst_tmp1 = dst + 32; + dst5 = __lsx_vldx(dst_tmp1, dst_stride); + dst_tmp1 = dst + 48; + dst7 = __lsx_vldx(dst_tmp1, dst_stride); + src += src_stride2; + + DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2); + DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst0); + __lsx_vst(tmp0, dst, 0); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, 
FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst1); + __lsx_vstx(tmp0, dst, dst_stride); + + DUP2_ARG2(__lsx_vilvl_b, src4, src3, src5, src4, vec4, vec6); + DUP2_ARG2(__lsx_vilvh_b, src4, src3, src5, src4, vec5, vec7); + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst2); + __lsx_vst(tmp0, dst, 16); + + dst_tmp1 = dst + 16; + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst3); + __lsx_vstx(tmp0, dst_tmp1, dst_stride); + + DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, vec0, vec2); + DUP2_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, vec1, vec3); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst4); + __lsx_vst(tmp0, dst, 32); + + dst_tmp1 = dst_tmp1 + 16; + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst5); + __lsx_vstx(tmp0, dst_tmp1, dst_stride); + + DUP2_ARG2(__lsx_vilvl_b, src10, src9, src11, src10, vec4, vec6); + DUP2_ARG2(__lsx_vilvh_b, src10, src9, src11, src10, vec5, vec7); + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst6); + __lsx_vst(tmp0, dst, 48); + + dst_tmp1 = dst_tmp1 + 16; + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst7); + __lsx_vstx(tmp0, dst_tmp1, dst_stride); + dst += dst_stride2; + + src0 = src2; + src3 = src5; + src6 = src8; + src9 = src11; + } +} + +void vpx_convolve8_avg_vert_lsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + const int16_t *const filter_y = filter[y0_q4]; + int8_t cnt, filt_ver[8]; + + assert(y_step_q4 == 16); + assert(((const int32_t *)filter_y)[1] != 0x800000); + + for (cnt = 0; cnt < 8; ++cnt) { + filt_ver[cnt] = filter_y[cnt]; + } + + if (vpx_get_filter_taps(filter_y) == 2) { + switch (w) { + case 4: + common_vt_2t_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_ver[3], h); + break; + case 8: + common_vt_2t_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_ver[3], h); + break; + case 16: + common_vt_2t_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_ver[3], h); + break; + case 32: + common_vt_2t_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_ver[3], h); + break; + case 64: + common_vt_2t_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_ver[3], h); + break; + default: + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } else { + switch (w) { + case 4: + common_vt_8t_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_ver, h); + break; + case 8: + common_vt_8t_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_ver, h); + break; + case 16: + common_vt_8t_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_ver, h); + + break; + case 32: + 
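/* The 32- and 64-wide cases reuse the 16-column kernel through common_vt_8t_and_aver_dst_16w_mult_lsx. */ +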
common_vt_8t_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_ver, h); + break; + case 64: + common_vt_8t_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_ver, h); + break; + default: + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c new file mode 100644 index 0000000000..2c6459a978 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c @@ -0,0 +1,814 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/vpx_convolve_lsx.h" + +static const uint8_t mc_filt_mask_arr[16 * 3] = { + /* 8 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, + /* 4 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20, + /* 4 width cases */ + 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 +}; + +static void common_hz_8t_4x4_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter) { + __m128i src0, src1, src2, src3; + __m128i filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3; + __m128i out, out0, out1; + + mask0 = __lsx_vld(mc_filt_mask_arr, 16); + src -= 3; + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + + LSX_LD_4(src, src_stride, src0, src1, src2, src3); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filter0, filter1, filter2, filter3, out0, out1); + out = __lsx_vssrarni_b_h(out1, out0, 7); + out = __lsx_vxori_b(out, 128); + __lsx_vstelm_w(out, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out, dst, 0, 3); +} + +static void common_hz_8t_4x8_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter) { + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + __m128i src0, src1, src2, src3; + __m128i filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3; + __m128i out0, out1, out2, out3; + uint8_t *_src = (uint8_t *)src - 3; + + mask0 = __lsx_vld(mc_filt_mask_arr, 16); + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + + src0 = __lsx_vld(_src, 0); + DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2); + src3 = __lsx_vldx(_src, src_stride3); + _src += src_stride4; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, 
src0, + src1, src2, src3); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filter0, filter1, filter2, filter3, out0, out1); + src0 = __lsx_vld(_src, 0); + DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2); + src3 = __lsx_vldx(_src, src_stride3); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filter0, filter1, filter2, filter3, out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + + __lsx_vstelm_w(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 3); + dst += dst_stride; + __lsx_vstelm_w(out1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out1, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out1, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out1, dst, 0, 3); +} + +static void common_hz_8t_4w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + if (height == 4) { + common_hz_8t_4x4_lsx(src, src_stride, dst, dst_stride, filter); + } else if (height == 8) { + common_hz_8t_4x8_lsx(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_hz_8t_8x4_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter) { + __m128i src0, src1, src2, src3; + __m128i filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3; + __m128i out0, out1, out2, out3; + + mask0 = __lsx_vld(mc_filt_mask_arr, 0); + src -= 3; + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + + LSX_LD_4(src, src_stride, src0, src1, src2, src3); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filter0, filter1, filter2, filter3, out0, out1, + out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vstelm_d(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 1); +} + +static void common_hz_8t_8x8mult_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt = height >> 2; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + __m128i src0, src1, src2, src3; + __m128i filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3; + __m128i out0, out1, out2, out3; + uint8_t *_src = (uint8_t *)src - 3; + + mask0 = __lsx_vld(mc_filt_mask_arr, 0); + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + + for (; loop_cnt--;) { + src0 = __lsx_vld(_src, 0); + DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2); + src3 = 
__lsx_vldx(_src, src_stride3); + _src += src_stride4; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filter0, filter1, filter2, filter3, out0, + out1, out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vstelm_d(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 1); + dst += dst_stride; + } +} + +static void common_hz_8t_8w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + if (height == 4) { + common_hz_8t_8x4_lsx(src, src_stride, dst, dst_stride, filter); + } else { + common_hz_8t_8x8mult_lsx(src, src_stride, dst, dst_stride, filter, height); + } +} + +static void common_hz_8t_16w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt = height >> 1; + int32_t stride = src_stride << 1; + __m128i src0, src1, src2, src3; + __m128i filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3; + __m128i out0, out1, out2, out3; + + mask0 = __lsx_vld(mc_filt_mask_arr, 0); + src -= 3; + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + + for (; loop_cnt--;) { + const uint8_t *_src = src + src_stride; + DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src0, src2); + DUP2_ARG2(__lsx_vld, src, 8, _src, 8, src1, src3); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filter0, filter1, filter2, filter3, out0, + out1, out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vst(out0, dst, 0); + dst += dst_stride; + __lsx_vst(out1, dst, 0); + dst += dst_stride; + src += stride; + } +} + +static void common_hz_8t_32w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt = height >> 1; + __m128i src0, src1, src2, src3; + __m128i filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3; + __m128i out0, out1, out2, out3; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + mask0 = __lsx_vld(mc_filt_mask_arr, 0); + src -= 3; + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2); + src3 = __lsx_vld(src, 24); + src1 = __lsx_vshuf_b(src2, src0, shuff); + src += src_stride; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filter0, filter1, filter2, filter3, out0, + out1, out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + 
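/* Inputs were biased into signed range with xori 128; this second xori restores unsigned pixels after the signed saturating narrow. */ +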
__lsx_vst(out0, dst, 0); + __lsx_vst(out1, dst, 16); + + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2); + src3 = __lsx_vld(src, 24); + src1 = __lsx_vshuf_b(src2, src0, shuff); + src += src_stride; + + dst += dst_stride; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filter0, filter1, filter2, filter3, out0, + out1, out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vst(out0, dst, 0); + __lsx_vst(out1, dst, 16); + dst += dst_stride; + } +} + +static void common_hz_8t_64w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + int32_t loop_cnt = height; + __m128i src0, src1, src2, src3; + __m128i filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3; + __m128i out0, out1, out2, out3; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + mask0 = __lsx_vld(mc_filt_mask_arr, 0); + src -= 3; + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2); + src3 = __lsx_vld(src, 24); + src1 = __lsx_vshuf_b(src2, src0, shuff); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filter0, filter1, filter2, filter3, out0, + out1, out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vst(out0, dst, 0); + __lsx_vst(out1, dst, 16); + + DUP2_ARG2(__lsx_vld, src, 32, src, 48, src0, src2); + src3 = __lsx_vld(src, 56); + src1 = __lsx_vshuf_b(src2, src0, shuff); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filter0, filter1, filter2, filter3, out0, + out1, out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vst(out0, dst, 32); + __lsx_vst(out1, dst, 48); + src += src_stride; + dst += dst_stride; + } +} + +static void common_hz_2t_4x4_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + __m128i src0, src1, src2, src3, mask; + __m128i filt0, vec0, vec1, vec2, vec3, res0, res1; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride + dst_stride2; + + mask = __lsx_vld(mc_filt_mask_arr, 16); + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask, src3, src2, mask, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, vec3); + DUP2_ARG3(__lsx_vssrarni_bu_h, vec2, vec2, FILTER_BITS, vec3, vec3, + FILTER_BITS, res0, res1); + + __lsx_vstelm_w(res0, dst, 0, 0); + __lsx_vstelm_w(res0, dst + dst_stride, 0, 1); + 
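/* Each result vector carries two 4-pixel rows, written one 32-bit lane at a time. */ +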
__lsx_vstelm_w(res1, dst + dst_stride2, 0, 0); + __lsx_vstelm_w(res1, dst + dst_stride3, 0, 1); +} + +static void common_hz_2t_4x8_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; + __m128i res0, res1, res2, res3, filt0; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride + dst_stride2; + + uint8_t *src_tmp1 = src + src_stride4; + + mask = __lsx_vld(mc_filt_mask_arr, 16); + + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src5, + src6); + src7 = __lsx_vldx(src_tmp1, src_stride3); + + DUP4_ARG3(__lsx_vshuf_b, src1, src0, mask, src3, src2, mask, src5, src4, mask, + src7, src6, mask, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + vec4, vec5, vec6, vec7); + DUP4_ARG3(__lsx_vssrarni_bu_h, vec4, vec4, FILTER_BITS, vec5, vec5, + FILTER_BITS, vec6, vec6, FILTER_BITS, vec7, vec7, FILTER_BITS, res0, + res1, res2, res3); + + __lsx_vstelm_w(res0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(res0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(res1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(res1, dst, 0, 1); + dst += dst_stride; + + __lsx_vstelm_w(res2, dst, 0, 0); + __lsx_vstelm_w(res2, dst + dst_stride, 0, 1); + __lsx_vstelm_w(res3, dst + dst_stride2, 0, 0); + __lsx_vstelm_w(res3, dst + dst_stride3, 0, 1); +} + +static void common_hz_2t_4w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + if (height == 4) { + common_hz_2t_4x4_lsx(src, src_stride, dst, dst_stride, filter); + } else if (height == 8) { + common_hz_2t_4x8_lsx(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_hz_2t_8x4_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + __m128i filt0, mask; + __m128i src0, src1, src2, src3; + __m128i vec0, vec1, vec2, vec3; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, mask, + src3, src3, mask, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + vec0, vec1, vec2, vec3); + DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2, + FILTER_BITS, vec0, vec1); + + __lsx_vstelm_d(vec0, dst, 0, 0); + __lsx_vstelm_d(vec0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(vec1, dst + dst_stride2, 0, 0); + __lsx_vstelm_d(vec1, dst + dst_stride3, 0, 1); +} + +static void common_hz_2t_8x8mult_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + __m128i filt0, mask; 
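+ /* Bilinear path: both 8-bit taps sit in one halfword, so one vdp2_h_bu per vector performs the pairwise filter. */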
+ __m128i src0, src1, src2, src3, out0, out1; + __m128i vec0, vec1, vec2, vec3; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, mask, + src3, src3, mask, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + vec0, vec1, vec2, vec3); + DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2, + FILTER_BITS, out0, out1); + + __lsx_vstelm_d(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 1); + dst += dst_stride; + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, mask, + src3, src3, mask, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + vec0, vec1, vec2, vec3); + DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2, + FILTER_BITS, out0, out1); + + __lsx_vstelm_d(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 1); + dst += dst_stride; + + if (height == 16) { + uint8_t *dst_tmp1 = dst + dst_stride4; + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, + mask, src3, src3, mask, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, vec0, vec1, vec2, vec3); + DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2, + FILTER_BITS, out0, out1); + + __lsx_vstelm_d(out0, dst, 0, 0); + __lsx_vstelm_d(out0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(out1, dst + dst_stride2, 0, 0); + __lsx_vstelm_d(out1, dst + dst_stride3, 0, 1); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, + mask, src3, src3, mask, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, vec0, vec1, vec2, vec3); + DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2, + FILTER_BITS, out0, out1); + + __lsx_vstelm_d(out0, dst_tmp1, 0, 0); + __lsx_vstelm_d(out0, dst_tmp1 + dst_stride, 0, 1); + __lsx_vstelm_d(out1, dst_tmp1 + dst_stride2, 0, 0); + __lsx_vstelm_d(out1, dst_tmp1 + dst_stride3, 0, 1); + } +} + +static void common_hz_2t_8w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + if (height == 4) { + common_hz_2t_8x4_lsx(src, 
src_stride, dst, dst_stride, filter); + } else { + common_hz_2t_8x8mult_lsx(src, src_stride, dst, dst_stride, filter, height); + } +} + +static void common_hz_2t_16w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = (height >> 2) - 1; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; + __m128i filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i out0, out1, out2, out3, out4, out5, out6, out7; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + uint8_t *src_tmp1 = src + 8; + mask = __lsx_vld(mc_filt_mask_arr, 0); + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4); + src6 = __lsx_vldx(src, src_stride3); + src1 = __lsx_vld(src_tmp1, 0); + DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src3, + src5); + src7 = __lsx_vldx(src_tmp1, src_stride3); + src += src_stride4; + + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, mask, + src3, src3, mask, vec0, vec1, vec2, vec3); + DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, src6, src6, mask, + src7, src7, mask, vec4, vec5, vec6, vec7); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + out0, out1, out2, out3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, filt0, + out4, out5, out6, out7); + DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2, + FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS, out0, + out1, out2, out3); + + __lsx_vst(out0, dst, 0); + dst += dst_stride; + __lsx_vst(out1, dst, 0); + dst += dst_stride; + __lsx_vst(out2, dst, 0); + dst += dst_stride; + __lsx_vst(out3, dst, 0); + dst += dst_stride; + + for (; loop_cnt--;) { + src_tmp1 += src_stride4; + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4); + src6 = __lsx_vldx(src, src_stride3); + + src1 = __lsx_vld(src_tmp1, 0); + DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src3, + src5); + src7 = __lsx_vldx(src_tmp1, src_stride3); + src += src_stride4; + + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, + mask, src3, src3, mask, vec0, vec1, vec2, vec3); + DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, src6, src6, + mask, src7, src7, mask, vec4, vec5, vec6, vec7); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, out0, out1, out2, out3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, + filt0, out4, out5, out6, out7); + DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2, + FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS, + out0, out1, out2, out3); + + __lsx_vst(out0, dst, 0); + dst += dst_stride; + __lsx_vst(out1, dst, 0); + dst += dst_stride; + __lsx_vst(out2, dst, 0); + dst += dst_stride; + __lsx_vst(out3, dst, 0); + dst += dst_stride; + } +} + +static void common_hz_2t_32w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = (height >> 1); + __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; + __m128i filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i out0, out1, out2, out3, out4, out5, out6, out7; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + 
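/* shuff rebuilds the unaligned middle 16 pixels (columns 8..23) from two aligned loads. */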
+ mask = __lsx_vld(mc_filt_mask_arr, 0); + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2); + src3 = __lsx_vld(src, 24); + src1 = __lsx_vshuf_b(src2, src0, shuff); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src4, src6); + src7 = __lsx_vld(src, 24); + src5 = __lsx_vshuf_b(src6, src4, shuff); + src += src_stride; + + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, + mask, src3, src3, mask, vec0, vec1, vec2, vec3); + DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, src6, src6, + mask, src7, src7, mask, vec4, vec5, vec6, vec7); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, out0, out1, out2, out3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, + filt0, out4, out5, out6, out7); + DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2, + FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS, + out0, out1, out2, out3); + + __lsx_vst(out0, dst, 0); + __lsx_vst(out1, dst, 16); + dst += dst_stride; + + __lsx_vst(out2, dst, 0); + __lsx_vst(out3, dst, 16); + dst += dst_stride; + } +} + +static void common_hz_2t_64w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = height; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; + __m128i filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i out0, out1, out2, out3, out4, out5, out6, out7; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + for (; loop_cnt--;) { + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src2, src4, + src6); + src7 = __lsx_vld(src, 56); + DUP2_ARG3(__lsx_vshuf_b, src2, src0, shuff, src4, src2, shuff, src1, src3); + src5 = __lsx_vshuf_b(src6, src4, shuff); + src += src_stride; + + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, + mask, src3, src3, mask, vec0, vec1, vec2, vec3); + DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, src6, src6, + mask, src7, src7, mask, vec4, vec5, vec6, vec7); + + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, out0, out1, out2, out3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, + filt0, out4, out5, out6, out7); + DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2, + FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS, + out0, out1, out2, out3); + + __lsx_vst(out0, dst, 0); + __lsx_vst(out1, dst, 16); + __lsx_vst(out2, dst, 32); + __lsx_vst(out3, dst, 48); + dst += dst_stride; + } +} + +void vpx_convolve8_horiz_lsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + const int16_t *const filter_x = filter[x0_q4]; + int8_t cnt, filt_hor[8]; + + assert(x_step_q4 == 16); + assert(((const int32_t *)filter_x)[1] != 0x800000); + + for (cnt = 0; cnt < 8; ++cnt) { + filt_hor[cnt] = filter_x[cnt]; + } + if (vpx_get_filter_taps(filter_x) == 2) { + switch (w) { + case 4: + common_hz_2t_4w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + case 8: + common_hz_2t_8w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + case 16: 
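+ /* Every 2-tap kernel receives &filt_hor[3], the halfword holding the two bilinear taps. */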
+ common_hz_2t_16w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + case 32: + common_hz_2t_32w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + case 64: + common_hz_2t_64w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + default: + vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } else { + switch (w) { + case 4: + common_hz_8t_4w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_hor, h); + break; + case 8: + common_hz_8t_8w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_hor, h); + break; + + case 16: + common_hz_8t_16w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_hor, h); + break; + + case 32: + common_hz_8t_32w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_hor, h); + break; + + case 64: + common_hz_8t_64w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_hor, h); + break; + default: + vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_lsx.c new file mode 100644 index 0000000000..9f5cd6cfe9 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_lsx.c @@ -0,0 +1,697 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <assert.h> +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/vpx_convolve_lsx.h" + +static const uint8_t mc_filt_mask_arr[16 * 3] = { + /* 8 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, + /* 4 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20, + /* 4 width cases */ + 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 +}; + +static void common_hv_8ht_8vt_4w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt = (height >> 2); + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3; + __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3; + __m128i mask0, mask1, mask2, mask3; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + __m128i out0, out1; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + mask0 = __lsx_vld(mc_filt_mask_arr, 16); + src -= (3 + 3 * src_stride); + DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz, 4, + filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3); + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + + LSX_LD_4(src, src_stride, src0, src1, src2, src3); + src += src_stride; + src4 = __lsx_vld(src, 0); + src += src_stride; + src5 = __lsx_vld(src, 0); + src += src_stride; + src6 = __lsx_vld(src, 0); + src += src_stride; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5); + src6 = __lsx_vxori_b(src6, 128); + + tmp0 = horiz_8tap_filt(src0, src1, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + tmp2 = horiz_8tap_filt(src2, src3, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + tmp4 = horiz_8tap_filt(src4, src5, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + tmp5 = horiz_8tap_filt(src5, src6, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + DUP2_ARG3(__lsx_vshuf_b, tmp2, tmp0, shuff, tmp4, tmp2, shuff, tmp1, tmp3); + DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4, + filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3); + DUP2_ARG2(__lsx_vpackev_b, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1); + tmp2 = __lsx_vpackev_b(tmp5, tmp4); + + for (; loop_cnt--;) { + LSX_LD_4(src, src_stride, src7, src8, src9, src10); + src += src_stride; + DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7, + src8, src9, src10); + tmp3 = horiz_8tap_filt(src7, src8, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + tmp4 = __lsx_vshuf_b(tmp3, tmp5, shuff); + tmp4 = __lsx_vpackev_b(tmp3, tmp4); + out0 = filt_8tap_dpadd_s_h(tmp0, tmp1, tmp2, tmp4, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + src1 = horiz_8tap_filt(src9, src10, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src0 = __lsx_vshuf_b(src1, tmp3, shuff); + src0 = __lsx_vpackev_b(src1, src0); + out1 = filt_8tap_dpadd_s_h(tmp1, tmp2, tmp4, src0, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + out0 = __lsx_vssrarni_b_h(out1, out0, 7); + out0 = __lsx_vxori_b(out0, 128); + __lsx_vstelm_w(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 3); + dst += dst_stride; + + tmp5 = src1; + tmp0 = tmp2; 
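+ /* Slide the packed-row window so the next iteration needs only two fresh horizontal passes. */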
+ tmp1 = tmp4; + tmp2 = src0; + } +} + +static void common_hv_8ht_8vt_8w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt = (height >> 2); + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3; + __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3; + __m128i mask0, mask1, mask2, mask3; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6; + __m128i out0, out1; + + mask0 = __lsx_vld(mc_filt_mask_arr, 0); + src -= (3 + 3 * src_stride); + DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz, 4, + filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3); + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + + LSX_LD_4(src, src_stride, src0, src1, src2, src3); + src += src_stride; + src4 = __lsx_vld(src, 0); + src += src_stride; + src5 = __lsx_vld(src, 0); + src += src_stride; + src6 = __lsx_vld(src, 0); + src += src_stride; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5); + src6 = __lsx_vxori_b(src6, 128); + + src0 = horiz_8tap_filt(src0, src0, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src1 = horiz_8tap_filt(src1, src1, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src2 = horiz_8tap_filt(src2, src2, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src3 = horiz_8tap_filt(src3, src3, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src4 = horiz_8tap_filt(src4, src4, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src5 = horiz_8tap_filt(src5, src5, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src6 = horiz_8tap_filt(src6, src6, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + + DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4, + filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3); + DUP4_ARG2(__lsx_vpackev_b, src1, src0, src3, src2, src5, src4, src2, src1, + tmp0, tmp1, tmp2, tmp4); + DUP2_ARG2(__lsx_vpackev_b, src4, src3, src6, src5, tmp5, tmp6); + + for (; loop_cnt--;) { + LSX_LD_4(src, src_stride, src7, src8, src9, src10); + src += src_stride; + DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7, + src8, src9, src10); + src7 = horiz_8tap_filt(src7, src7, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + tmp3 = __lsx_vpackev_b(src7, src6); + out0 = filt_8tap_dpadd_s_h(tmp0, tmp1, tmp2, tmp3, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + src8 = horiz_8tap_filt(src8, src8, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src0 = __lsx_vpackev_b(src8, src7); + out1 = filt_8tap_dpadd_s_h(tmp4, tmp5, tmp6, src0, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + src9 = horiz_8tap_filt(src9, src9, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src1 = __lsx_vpackev_b(src9, src8); + src3 = filt_8tap_dpadd_s_h(tmp1, tmp2, tmp3, src1, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + src10 = horiz_8tap_filt(src10, src10, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src2 = __lsx_vpackev_b(src10, src9); + src4 = filt_8tap_dpadd_s_h(tmp5, tmp6, src0, src2, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, 
out0, 7, src4, src3, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vstelm_d(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 1); + dst += dst_stride; + + src6 = src10; + tmp0 = tmp2; + tmp1 = tmp3; + tmp2 = src1; + tmp4 = tmp6; + tmp5 = src0; + tmp6 = src2; + } +} + +static void common_hv_8ht_8vt_16w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + common_hv_8ht_8vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + src += 8; + dst += 8; + + common_hv_8ht_8vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + src += 8; + dst += 8; +} + +static void common_hv_8ht_8vt_32w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 4; multiple8_cnt--;) { + common_hv_8ht_8vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + src += 8; + dst += 8; + } +} + +static void common_hv_8ht_8vt_64w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 8; multiple8_cnt--;) { + common_hv_8ht_8vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + src += 8; + dst += 8; + } +} + +static void common_hv_2ht_2vt_4x4_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, + int8_t *filter_vert) { + __m128i src0, src1, src2, src3, src4, mask; + __m128i filt_vt, filt_hz, vec0, vec1; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + mask = __lsx_vld(mc_filt_mask_arr, 16); + + /* rearranging filter */ + filt_hz = __lsx_vldrepl_h(filter_horiz, 0); + filt_vt = __lsx_vldrepl_h(filter_vert, 0); + + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + hz_out0 = horiz_2tap_filt_uh(src0, src1, mask, filt_hz); + hz_out2 = horiz_2tap_filt_uh(src2, src3, mask, filt_hz); + hz_out4 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); + + hz_out1 = __lsx_vshuf_b(hz_out2, hz_out0, shuff); + hz_out3 = __lsx_vpickod_d(hz_out4, hz_out2); + + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp0, tmp0, FILTER_BITS, tmp1, tmp1, + FILTER_BITS, tmp0, tmp1); + + __lsx_vstelm_w(tmp0, dst, 0, 0); + __lsx_vstelm_w(tmp0, dst + dst_stride, 0, 1); + __lsx_vstelm_w(tmp1, dst + dst_stride2, 0, 0); + __lsx_vstelm_w(tmp1, dst + dst_stride3, 0, 1); +} + +static void common_hv_2ht_2vt_4x8_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, + int8_t *filter_vert) { + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, mask; + __m128i filt_hz, filt_vt, vec0, vec1, 
vec2, vec3; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + __m128i hz_out7, hz_out8, vec4, vec5, vec6, vec7; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + + mask = __lsx_vld(mc_filt_mask_arr, 16); + + /* rearranging filter */ + DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt); + + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + src += src_stride4; + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src5, src6, src7, src8); + src += src_stride4; + + hz_out0 = horiz_2tap_filt_uh(src0, src1, mask, filt_hz); + hz_out2 = horiz_2tap_filt_uh(src2, src3, mask, filt_hz); + hz_out4 = horiz_2tap_filt_uh(src4, src5, mask, filt_hz); + hz_out6 = horiz_2tap_filt_uh(src6, src7, mask, filt_hz); + hz_out8 = horiz_2tap_filt_uh(src8, src8, mask, filt_hz); + + DUP2_ARG3(__lsx_vshuf_b, hz_out2, hz_out0, shuff, hz_out4, hz_out2, shuff, + hz_out1, hz_out3); + hz_out5 = __lsx_vshuf_b(hz_out6, hz_out4, shuff); + hz_out7 = __lsx_vpickod_d(hz_out8, hz_out6); + DUP4_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, hz_out5, + hz_out4, hz_out7, hz_out6, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, vec2, filt_vt, vec3, + filt_vt, vec4, vec5, vec6, vec7); + DUP4_ARG3(__lsx_vssrarni_bu_h, vec4, vec4, FILTER_BITS, vec5, vec5, + FILTER_BITS, vec6, vec6, FILTER_BITS, vec7, vec7, FILTER_BITS, vec4, + vec5, vec6, vec7); + + __lsx_vstelm_w(vec4, dst, 0, 0); + __lsx_vstelm_w(vec4, dst + dst_stride, 0, 1); + __lsx_vstelm_w(vec5, dst + dst_stride2, 0, 0); + __lsx_vstelm_w(vec5, dst + dst_stride3, 0, 1); + dst += dst_stride4; + __lsx_vstelm_w(vec6, dst, 0, 0); + __lsx_vstelm_w(vec6, dst + dst_stride, 0, 1); + __lsx_vstelm_w(vec7, dst + dst_stride2, 0, 0); + __lsx_vstelm_w(vec7, dst + dst_stride3, 0, 1); +} + +static void common_hv_2ht_2vt_4w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + if (height == 4) { + common_hv_2ht_2vt_4x4_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert); + } else if (height == 8) { + common_hv_2ht_2vt_4x8_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert); + } +} + +static void common_hv_2ht_2vt_8x4_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, + int8_t *filter_vert) { + __m128i src0, src1, src2, src3, src4, mask; + __m128i filt_hz, filt_vt, vec0, vec1, vec2, vec3; + __m128i hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + + /* rearranging filter */ + DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt); + + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + + hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, 
filt_hz); + hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp0 = __lsx_vdp2_h_bu(vec0, filt_vt); + + hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz); + vec1 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp1 = __lsx_vdp2_h_bu(vec1, filt_vt); + + hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz); + vec2 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp2 = __lsx_vdp2_h_bu(vec2, filt_vt); + + hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); + vec3 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp3 = __lsx_vdp2_h_bu(vec3, filt_vt); + + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, tmp0, tmp1); + + __lsx_vstelm_d(tmp0, dst, 0, 0); + __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(tmp1, dst + dst_stride2, 0, 0); + __lsx_vstelm_d(tmp1, dst + dst_stride3, 0, 1); +} + +static void common_hv_2ht_2vt_8x8mult_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter_horiz, + int8_t *filter_vert, int32_t height) { + uint32_t loop_cnt = (height >> 3); + __m128i src0, src1, src2, src3, src4, mask; + __m128i filt_hz, filt_vt, vec0; + __m128i hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + + /* rearranging filter */ + DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt); + + src0 = __lsx_vld(src, 0); + src += src_stride; + + hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz); + + for (; loop_cnt--;) { + src1 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3); + src4 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt); + + hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz); + vec0 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt); + + hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt); + + hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); + src1 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3); + src4 = __lsx_vldx(src, src_stride3); + src += src_stride4; + vec0 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp4 = __lsx_vdp2_h_bu(vec0, filt_vt); + + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp2, tmp1, FILTER_BITS, tmp4, tmp3, + FILTER_BITS, tmp1, tmp2); + + __lsx_vstelm_d(tmp1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(tmp1, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(tmp2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(tmp2, dst, 0, 1); + dst += dst_stride; + + hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt); + + hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz); + vec0 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt); + + hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt); + + hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); + vec0 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp4 = __lsx_vdp2_h_bu(vec0, filt_vt); + + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp2, tmp1, FILTER_BITS, tmp4, tmp3, + 
FILTER_BITS, tmp1, tmp2);
+
+    __lsx_vstelm_d(tmp1, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(tmp1, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_d(tmp2, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(tmp2, dst, 0, 1);
+    dst += dst_stride;
+  }
+}
+
+static void common_hv_2ht_2vt_8w_lsx(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     int8_t *filter_horiz, int8_t *filter_vert,
+                                     int32_t height) {
+  if (height == 4) {
+    common_hv_2ht_2vt_8x4_lsx(src, src_stride, dst, dst_stride, filter_horiz,
+                              filter_vert);
+  } else {
+    common_hv_2ht_2vt_8x8mult_lsx(src, src_stride, dst, dst_stride,
+                                  filter_horiz, filter_vert, height);
+  }
+}
+
+static void common_hv_2ht_2vt_16w_lsx(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter_horiz, int8_t *filter_vert,
+                                      int32_t height) {
+  uint32_t loop_cnt = (height >> 2);
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
+  __m128i filt_hz, filt_vt, vec0, vec1;
+  __m128i tmp, tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+  /* rearranging filter */
+  DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt);
+
+  DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
+  src += src_stride;
+
+  hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz);
+  hz_out2 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
+
+  for (; loop_cnt--;) {
+    /* src_tmp0 points at the right 8-byte half of each 16-wide row. */
+    uint8_t *src_tmp0 = (uint8_t *)src + 8;
+
+    DUP2_ARG2(__lsx_vld, src, 0, src_tmp0, 0, src0, src1);
+    DUP4_ARG2(__lsx_vldx, src, src_stride, src_tmp0, src_stride, src,
+              src_stride2, src_tmp0, src_stride2, src2, src3, src4, src5);
+    DUP2_ARG2(__lsx_vldx, src, src_stride3, src_tmp0, src_stride3, src6, src7);
+    src += src_stride4;
+
+    hz_out1 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz);
+    hz_out3 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2);
+    tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS);
+    __lsx_vst(tmp, dst, 0);
+    dst += dst_stride;
+
+    hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz);
+    hz_out2 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2);
+    tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS);
+    __lsx_vst(tmp, dst, 0);
+    dst += dst_stride;
+
+    hz_out1 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
+    hz_out3 = horiz_2tap_filt_uh(src5, src5, mask, filt_hz);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2);
+    tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS);
+    __lsx_vst(tmp, dst, 0);
+    dst += dst_stride;
+
+    hz_out0 = horiz_2tap_filt_uh(src6, src6, mask, filt_hz);
+    hz_out2 = horiz_2tap_filt_uh(src7, src7, mask, filt_hz);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2);
+    tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS);
+    __lsx_vst(tmp, dst, 0);
+    dst += dst_stride;
+  }
+}
+
+static void common_hv_2ht_2vt_32w_lsx(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter_horiz, int8_t *filter_vert,
+                                      int32_t
height) { + common_hv_2ht_2vt_16w_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + src += 16; + dst += 16; + + common_hv_2ht_2vt_16w_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); +} + +static void common_hv_2ht_2vt_64w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 4; multiple8_cnt--;) { + common_hv_2ht_2vt_16w_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + src += 16; + dst += 16; + } +} + +void vpx_convolve8_lsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int32_t x_step_q4, int y0_q4, + int32_t y_step_q4, int32_t w, int32_t h) { + const int16_t *const filter_x = filter[x0_q4]; + const int16_t *const filter_y = filter[y0_q4]; + int8_t cnt, filt_hor[8], filt_ver[8]; + + assert(x_step_q4 == 16); + assert(y_step_q4 == 16); + assert(((const int32_t *)filter_x)[1] != 0x800000); + assert(((const int32_t *)filter_y)[1] != 0x800000); + + for (cnt = 0; cnt < 8; ++cnt) { + filt_hor[cnt] = filter_x[cnt]; + filt_ver[cnt] = filter_y[cnt]; + } + + if (vpx_get_filter_taps(filter_x) == 2 && + vpx_get_filter_taps(filter_y) == 2) { + switch (w) { + case 4: + common_hv_2ht_2vt_4w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], (int32_t)h); + break; + case 8: + common_hv_2ht_2vt_8w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], (int32_t)h); + break; + case 16: + common_hv_2ht_2vt_16w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], (int32_t)h); + break; + case 32: + common_hv_2ht_2vt_32w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], (int32_t)h); + break; + case 64: + common_hv_2ht_2vt_64w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], (int32_t)h); + break; + default: + vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } else if (vpx_get_filter_taps(filter_x) == 2 || + vpx_get_filter_taps(filter_y) == 2) { + vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, + y0_q4, y_step_q4, w, h); + } else { + switch (w) { + case 4: + common_hv_8ht_8vt_4w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, filt_ver, + (int32_t)h); + break; + case 8: + common_hv_8ht_8vt_8w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, filt_ver, + (int32_t)h); + break; + case 16: + common_hv_8ht_8vt_16w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, filt_ver, + (int32_t)h); + break; + case 32: + common_hv_8ht_8vt_32w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, filt_ver, + (int32_t)h); + break; + case 64: + common_hv_8ht_8vt_64w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, filt_ver, + (int32_t)h); + break; + default: + vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c new file mode 100644 index 0000000000..6022e43c83 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c @@ -0,0 +1,825 @@ +/* + * 
Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/vpx_convolve_lsx.h" + +static void common_vt_8t_4w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = height >> 2; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + __m128i reg0, reg1, reg2, reg3, reg4; + __m128i filter0, filter1, filter2, filter3; + __m128i out0, out1; + uint8_t *_src = (uint8_t *)src - src_stride3; + + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + src0 = __lsx_vld(_src, 0); + DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2); + src3 = __lsx_vldx(_src, src_stride3); + _src += src_stride4; + src4 = __lsx_vld(_src, 0); + DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6); + _src += src_stride3; + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, tmp0, + tmp1, tmp2, tmp3); + DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, tmp4, tmp5); + DUP2_ARG2(__lsx_vilvl_d, tmp3, tmp0, tmp4, tmp1, reg0, reg1); + reg2 = __lsx_vilvl_d(tmp5, tmp2); + DUP2_ARG2(__lsx_vxori_b, reg0, 128, reg1, 128, reg0, reg1); + reg2 = __lsx_vxori_b(reg2, 128); + + for (; loop_cnt--;) { + src7 = __lsx_vld(_src, 0); + DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9); + src10 = __lsx_vldx(_src, src_stride3); + _src += src_stride4; + DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9, + tmp0, tmp1, tmp2, tmp3); + DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, reg3, reg4); + DUP2_ARG2(__lsx_vxori_b, reg3, 128, reg4, 128, reg3, reg4); + out0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, reg3, filter0, filter1, + filter2, filter3); + out1 = filt_8tap_dpadd_s_h(reg1, reg2, reg3, reg4, filter0, filter1, + filter2, filter3); + out0 = __lsx_vssrarni_b_h(out1, out0, 7); + out0 = __lsx_vxori_b(out0, 128); + __lsx_vstelm_w(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 3); + dst += dst_stride; + + reg0 = reg2; + reg1 = reg3; + reg2 = reg4; + src6 = src10; + } +} + +static void common_vt_8t_8w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = height >> 2; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i reg0, reg1, reg2, reg3, reg4, reg5; + __m128i filter0, filter1, filter2, filter3; + __m128i out0, out1, out2, out3; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + src = src - src_stride3; + + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + + src0 = 
__lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + src4 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6); + src += src_stride3; + + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5); + src6 = __lsx_vxori_b(src6, 128); + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, reg0, + reg1, reg2, reg3); + DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5); + + for (; loop_cnt--;) { + src7 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src8, src9); + src10 = __lsx_vldx(src, src_stride3); + src += src_stride4; + DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7, + src8, src9, src10); + DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9, + tmp0, tmp1, tmp2, tmp3); + out0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, tmp0, filter0, filter1, + filter2, filter3); + out1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, tmp1, filter0, filter1, + filter2, filter3); + out2 = filt_8tap_dpadd_s_h(reg1, reg2, tmp0, tmp2, filter0, filter1, + filter2, filter3); + out3 = filt_8tap_dpadd_s_h(reg4, reg5, tmp1, tmp3, filter0, filter1, + filter2, filter3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vstelm_d(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 1); + dst += dst_stride; + + reg0 = reg2; + reg1 = tmp0; + reg2 = tmp2; + reg3 = reg5; + reg4 = tmp1; + reg5 = tmp3; + src6 = src10; + } +} + +static void common_vt_8t_16w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = height >> 2; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + __m128i filter0, filter1, filter2, filter3; + __m128i reg0, reg1, reg2, reg3, reg4, reg5; + __m128i reg6, reg7, reg8, reg9, reg10, reg11; + __m128i tmp0, tmp1, tmp2, tmp3; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + // uint8_t *_src = (uint8_t *)src - src_stride3; + src -= src_stride3; + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + src += src_stride4; + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6); + src += src_stride3; + + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5); + src6 = __lsx_vxori_b(src6, 128); + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, reg0, + reg1, reg2, reg3); + DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5); + DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1, reg6, + reg7, reg8, reg9); + DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, reg10, reg11); + + for (; loop_cnt--;) { + src7 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src8, src9); + 
src10 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7, + src8, src9, src10); + DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9, + src0, src1, src2, src3); + DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10, src9, + src4, src5, src7, src8); + tmp0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, src0, filter0, filter1, + filter2, filter3); + tmp1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, src1, filter0, filter1, + filter2, filter3); + tmp2 = filt_8tap_dpadd_s_h(reg6, reg7, reg8, src4, filter0, filter1, + filter2, filter3); + tmp3 = filt_8tap_dpadd_s_h(reg9, reg10, reg11, src5, filter0, filter1, + filter2, filter3); + DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + __lsx_vst(tmp0, dst, 0); + dst += dst_stride; + __lsx_vst(tmp1, dst, 0); + dst += dst_stride; + tmp0 = filt_8tap_dpadd_s_h(reg1, reg2, src0, src2, filter0, filter1, + filter2, filter3); + tmp1 = filt_8tap_dpadd_s_h(reg4, reg5, src1, src3, filter0, filter1, + filter2, filter3); + tmp2 = filt_8tap_dpadd_s_h(reg7, reg8, src4, src7, filter0, filter1, + filter2, filter3); + tmp3 = filt_8tap_dpadd_s_h(reg10, reg11, src5, src8, filter0, filter1, + filter2, filter3); + DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + __lsx_vst(tmp0, dst, 0); + dst += dst_stride; + __lsx_vst(tmp1, dst, 0); + dst += dst_stride; + + reg0 = reg2; + reg1 = src0; + reg2 = src2; + reg3 = reg5; + reg4 = src1; + reg5 = src3; + reg6 = reg8; + reg7 = src4; + reg8 = src7; + reg9 = reg11; + reg10 = src5; + reg11 = src8; + src6 = src10; + } +} + +static void common_vt_8t_16w_mult_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height, + int32_t width) { + uint8_t *src_tmp; + uint8_t *dst_tmp; + uint32_t cnt = width >> 4; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + __m128i filter0, filter1, filter2, filter3; + __m128i reg0, reg1, reg2, reg3, reg4, reg5; + __m128i reg6, reg7, reg8, reg9, reg10, reg11; + __m128i tmp0, tmp1, tmp2, tmp3; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + src -= src_stride3; + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + + for (; cnt--;) { + uint32_t loop_cnt = height >> 2; + + src_tmp = src; + dst_tmp = dst; + + src0 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src1, + src2); + src3 = __lsx_vldx(src_tmp, src_stride3); + src_tmp += src_stride4; + src4 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5, + src6); + src_tmp += src_stride3; + + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5); + src6 = __lsx_vxori_b(src6, 128); + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, + reg0, reg1, reg2, reg3); + DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5); + DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1, + reg6, reg7, reg8, reg9); 
+ DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, reg10, reg11); + + for (; loop_cnt--;) { + src7 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src8, + src9); + src10 = __lsx_vldx(src_tmp, src_stride3); + src_tmp += src_stride4; + DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, + src7, src8, src9, src10); + DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9, + src0, src1, src2, src3); + DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10, src9, + src4, src5, src7, src8); + tmp0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, src0, filter0, filter1, + filter2, filter3); + tmp1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, src1, filter0, filter1, + filter2, filter3); + tmp2 = filt_8tap_dpadd_s_h(reg6, reg7, reg8, src4, filter0, filter1, + filter2, filter3); + tmp3 = filt_8tap_dpadd_s_h(reg9, reg10, reg11, src5, filter0, filter1, + filter2, filter3); + DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + __lsx_vst(tmp0, dst_tmp, 0); + __lsx_vstx(tmp1, dst_tmp, dst_stride); + tmp0 = filt_8tap_dpadd_s_h(reg1, reg2, src0, src2, filter0, filter1, + filter2, filter3); + tmp1 = filt_8tap_dpadd_s_h(reg4, reg5, src1, src3, filter0, filter1, + filter2, filter3); + tmp2 = filt_8tap_dpadd_s_h(reg7, reg8, src4, src7, filter0, filter1, + filter2, filter3); + tmp3 = filt_8tap_dpadd_s_h(reg10, reg11, src5, src8, filter0, filter1, + filter2, filter3); + DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + __lsx_vstx(tmp0, dst_tmp, dst_stride2); + __lsx_vstx(tmp1, dst_tmp, dst_stride3); + dst_tmp += dst_stride4; + + reg0 = reg2; + reg1 = src0; + reg2 = src2; + reg3 = reg5; + reg4 = src1; + reg5 = src3; + reg6 = reg8; + reg7 = src4; + reg8 = src7; + reg9 = reg11; + reg10 = src5; + reg11 = src8; + src6 = src10; + } + src += 16; + dst += 16; + } +} + +static void common_vt_8t_32w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + common_vt_8t_16w_mult_lsx(src, src_stride, dst, dst_stride, filter, height, + 32); +} + +static void common_vt_8t_64w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + common_vt_8t_16w_mult_lsx(src, src_stride, dst, dst_stride, filter, height, + 64); +} + +static void common_vt_2t_4x4_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + __m128i src0, src1, src2, src3, src4; + __m128i vec0, vec1, vec2, vec3, vec4, vec5; + __m128i filt0, tmp0, tmp1; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + src += (src_stride4 + src_stride); + + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, vec0, + vec1, vec2, vec3); + DUP2_ARG2(__lsx_vilvl_d, vec1, vec0, vec3, vec2, vec4, vec5); + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + __lsx_vstelm_w(tmp0, dst, 0, 0); + __lsx_vstelm_w(tmp0, dst + 
dst_stride, 0, 1); + __lsx_vstelm_w(tmp0, dst + dst_stride2, 0, 2); + __lsx_vstelm_w(tmp0, dst + dst_stride3, 0, 3); +} + +static void common_vt_2t_4x8_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8; + __m128i vec0, vec1, vec2, vec3, vec4, vec5; + __m128i vec6, vec7, vec8, vec9, vec10, vec11; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i filt0; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + uint8_t *dst_tmp1 = dst + dst_stride4; + + filt0 = __lsx_vldrepl_h(filter, 0); + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + src += src_stride4; + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src5, src6, src7, src8); + src += (src_stride4 + src_stride); + + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, vec0, + vec1, vec2, vec3); + DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7, vec4, + vec5, vec6, vec7); + DUP4_ARG2(__lsx_vilvl_d, vec1, vec0, vec3, vec2, vec5, vec4, vec7, vec6, vec8, + vec9, vec10, vec11); + + DUP4_ARG2(__lsx_vdp2_h_bu, vec8, filt0, vec9, filt0, vec10, filt0, vec11, + filt0, tmp0, tmp1, tmp2, tmp3); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, tmp0, tmp1); + + __lsx_vstelm_w(tmp0, dst, 0, 0); + __lsx_vstelm_w(tmp0, dst + dst_stride, 0, 1); + __lsx_vstelm_w(tmp0, dst + dst_stride2, 0, 2); + __lsx_vstelm_w(tmp0, dst + dst_stride3, 0, 3); + + __lsx_vstelm_w(tmp1, dst_tmp1, 0, 0); + __lsx_vstelm_w(tmp1, dst_tmp1 + dst_stride, 0, 1); + __lsx_vstelm_w(tmp1, dst_tmp1 + dst_stride2, 0, 2); + __lsx_vstelm_w(tmp1, dst_tmp1 + dst_stride3, 0, 3); +} + +static void common_vt_2t_4w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + if (height == 4) { + common_vt_2t_4x4_lsx(src, src_stride, dst, dst_stride, filter); + } else if (height == 8) { + common_vt_2t_4x8_lsx(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_vt_2t_8x4_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + __m128i src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0; + __m128i out0, out1, tmp0, tmp1, tmp2, tmp3; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, vec0, + vec1, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + tmp0, tmp1, tmp2, tmp3); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, out0, out1); + + __lsx_vstelm_d(out0, dst, 0, 0); + __lsx_vstelm_d(out0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(out1, dst + dst_stride2, 0, 0); + __lsx_vstelm_d(out1, dst + dst_stride3, 0, 1); +} + 
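+/* The common_vt_2t_* paths below share one bilinear scheme: two adjacent
+ * rows are byte-interleaved, a single widening dot-product (__lsx_vdp2_h_bu)
+ * against the replicated tap pair applies both filter taps at once, and
+ * __lsx_vssrarni_bu_h rounds, narrows and saturates by FILTER_BITS.  As a
+ * rough scalar sketch of one output pixel (the names here are illustrative,
+ * not identifiers from this file):
+ *
+ *   out[x] = clip_u8((filt[0] * above[x] + filt[1] * below[x] +
+ *                     (1 << (FILTER_BITS - 1))) >> FILTER_BITS);
+ */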
+static void common_vt_2t_8x8mult_lsx(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = (height >> 3);
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+  __m128i out0, out1, tmp0, tmp1, tmp2, tmp3;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+  int32_t dst_stride4 = dst_stride2 << 1;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+  src0 = __lsx_vld(src, 0);
+  src += src_stride;
+
+  /* Each iteration consumes eight new rows and emits eight output rows. */
+  for (; loop_cnt--;) {
+    src1 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
+    src4 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+    src5 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src6, src7);
+    src8 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+
+    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3,
+              vec0, vec1, vec2, vec3);
+    DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7,
+              vec4, vec5, vec6, vec7);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+              filt0, tmp0, tmp1, tmp2, tmp3);
+    DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+              FILTER_BITS, out0, out1);
+
+    __lsx_vstelm_d(out0, dst, 0, 0);
+    __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
+    __lsx_vstelm_d(out1, dst + dst_stride2, 0, 0);
+    __lsx_vstelm_d(out1, dst + dst_stride3, 0, 1);
+    dst += dst_stride4;
+
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
+              filt0, tmp0, tmp1, tmp2, tmp3);
+    DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+              FILTER_BITS, out0, out1);
+
+    __lsx_vstelm_d(out0, dst, 0, 0);
+    __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
+    __lsx_vstelm_d(out1, dst + dst_stride2, 0, 0);
+    __lsx_vstelm_d(out1, dst + dst_stride3, 0, 1);
+    dst += dst_stride4;
+
+    src0 = src8;
+  }
+}
+
+static void common_vt_2t_8w_lsx(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                int8_t *filter, int32_t height) {
+  if (height == 4) {
+    common_vt_2t_8x4_lsx(src, src_stride, dst, dst_stride, filter);
+  } else {
+    common_vt_2t_8x8mult_lsx(src, src_stride, dst, dst_stride, filter, height);
+  }
+}
+
+static void common_vt_2t_16w_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = (height >> 2);
+  __m128i src0, src1, src2, src3, src4, tmp, tmp0, tmp1;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  src0 = __lsx_vld(src, 0);
+  src += src_stride;
+
+  for (; loop_cnt--;) {
+    src1 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
+    src4 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+
+    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2);
+    DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst, 0);
+    dst += dst_stride;
+
+    DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6);
+    DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst, 0);
+    dst += dst_stride;
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst, 0);
+    dst += dst_stride;
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst, 0);
+    dst += dst_stride;
+
+    src0 = src4;
+  }
+}
+
+static void common_vt_2t_32w_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = (height >> 2);
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+  __m128i tmp, tmp0, tmp1;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+  uint8_t *src_tmp;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src5);
+  src += src_stride;
+  /* src_tmp tracks the right 16-byte half of each 32-wide row. */
+  src_tmp = (uint8_t *)src + 16;
+
+  for (; loop_cnt--;) {
+    DUP2_ARG2(__lsx_vld, src, 0, src_tmp, 0, src1, src6);
+    DUP4_ARG2(__lsx_vldx, src, src_stride, src_tmp, src_stride, src,
+              src_stride2, src_tmp, src_stride2, src2, src7, src3, src8);
+    DUP2_ARG2(__lsx_vldx, src, src_stride3, src_tmp, src_stride3, src4, src9);
+    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2);
+    DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3);
+    src += src_stride4;
+    src_tmp += src_stride4;
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst, 0);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vstx(tmp, dst, dst_stride);
+
+    DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6);
+    DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vstx(tmp, dst, dst_stride2);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vstx(tmp, dst, dst_stride3);
+
+    DUP2_ARG2(__lsx_vilvl_b, src6, src5, src7, src6, vec0, vec2);
+    DUP2_ARG2(__lsx_vilvh_b, src6, src5, src7, src6, vec1, vec3);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst, 16);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    dst += dst_stride;
+    __lsx_vst(tmp, dst, 16);
+
+    DUP2_ARG2(__lsx_vilvl_b, src8, src7, src9, src8, vec4, vec6);
+    DUP2_ARG2(__lsx_vilvh_b, src8, src7, src9, src8, vec5, vec7);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    dst += dst_stride;
+    __lsx_vst(tmp, dst, 16);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    dst += dst_stride;
+    __lsx_vst(tmp, dst, 16);
+
+    dst += dst_stride;
+
+    src0 = src4;
+    src5 = src9;
+  }
+}
+
+static void common_vt_2t_64w_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = (height >> 1);
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  __m128i src11, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+  __m128i tmp, tmp0, tmp1;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t dst_stride2 = dst_stride << 1;
+  uint8_t *dst_tmp1 = dst + dst_stride;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src3, src6,
+            src9);
+  src += src_stride;
+
+  /* Each iteration filters two rows across four 16-byte segments. */
+  for (; loop_cnt--;) {
+    uint8_t *src_tmp0 = (uint8_t *)src + src_stride;
+
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src1, src4, src7,
+              src10);
+    DUP4_ARG2(__lsx_vld, src_tmp0, 0, src_tmp0, 16, src_tmp0, 32, src_tmp0, 48,
+              src2, src5, src8, src11);
+    src += src_stride2;
+
+    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2);
+    DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst, 0);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst_tmp1, 0);
+
+    DUP2_ARG2(__lsx_vilvl_b, src4, src3, src5, src4, vec4, vec6);
+    DUP2_ARG2(__lsx_vilvh_b, src4, src3, src5, src4, vec5, vec7);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst, 16);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst_tmp1, 16);
+
+    DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, vec0, vec2);
+    DUP2_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, vec1, vec3);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst, 32);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst_tmp1, 32);
+
+    DUP2_ARG2(__lsx_vilvl_b, src10, src9, src11, src10, vec4, vec6);
+    DUP2_ARG2(__lsx_vilvh_b, src10, src9, src11, src10, vec5, vec7);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst, 48);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst_tmp1, 48);
+    dst += dst_stride2;
+    dst_tmp1 += dst_stride2;
+
+    src0 = src2;
+    src3 = src5;
+    src6 = src8;
+    src9 = src11;
+  }
+}
+
+void vpx_convolve8_vert_lsx(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride,
+                            const InterpKernel *filter, int x0_q4,
+                            int32_t x_step_q4, int y0_q4, int y_step_q4, int w,
+                            int h) {
+  const int16_t *const filter_y = filter[y0_q4];
+  int8_t cnt, filt_ver[8];
+
+  assert(y_step_q4 == 16);
+  assert(((const int32_t *)filter_y)[1] != 0x800000);
+
+  for (cnt = 8; cnt--;) {
+    filt_ver[cnt] = filter_y[cnt];
+  }
+
+  if (vpx_get_filter_taps(filter_y) == 2) {
+    switch (w) {
+      case 4:
+        common_vt_2t_4w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                            &filt_ver[3], h);
+        break;
+      case 8:
+        common_vt_2t_8w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                            &filt_ver[3], h);
+        break;
+      case 16:
+        common_vt_2t_16w_lsx(src, (int32_t)src_stride,
dst, (int32_t)dst_stride, + &filt_ver[3], h); + break; + case 32: + common_vt_2t_32w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_ver[3], h); + break; + case 64: + common_vt_2t_64w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_ver[3], h); + break; + default: + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } else { + switch (w) { + case 4: + common_vt_8t_4w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_ver, h); + break; + case 8: + common_vt_8t_8w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_ver, h); + break; + case 16: + common_vt_8t_16w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_ver, h); + break; + case 32: + common_vt_8t_32w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_ver, h); + break; + case 64: + common_vt_8t_64w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_ver, h); + break; + default: + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_avg_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_avg_lsx.c new file mode 100644 index 0000000000..1dad29eeed --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_avg_lsx.c @@ -0,0 +1,321 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_util/loongson_intrinsics.h" + +static void avg_width4_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int32_t height) { + int32_t cnt; + __m128i src0, src1; + __m128i dst0, dst1; + + int32_t src_stride2 = src_stride << 1; + + if ((height % 2) == 0) { + for (cnt = (height / 2); cnt--;) { + src0 = __lsx_vld(src, 0); + src1 = __lsx_vldx(src, src_stride); + src += src_stride2; + + dst0 = __lsx_vld(dst, 0); + dst1 = __lsx_vldx(dst, dst_stride); + DUP2_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1, dst0, dst1); + + __lsx_vstelm_w(dst0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(dst1, dst, 0, 0); + dst += dst_stride; + } + } +} + +static void avg_width8_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int32_t height) { + int32_t cnt = (height / 4); + __m128i src0, src1, src2, src3; + __m128i dst0, dst1, dst2, dst3; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + + for (; cnt--;) { + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + dst0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2); + dst3 = __lsx_vldx(dst, dst_stride3); + + DUP4_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1, src2, dst2, src3, dst3, + dst0, dst1, dst2, dst3); + + __lsx_vstelm_d(dst0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(dst1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(dst2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(dst3, dst, 0, 0); + dst += dst_stride; + } +} + +static void avg_width16_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t height) { + int32_t cnt = (height / 8); + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + + for (; cnt--;) { + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + src4 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6); + src7 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + dst0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2); + dst3 = __lsx_vldx(dst, dst_stride3); + dst += dst_stride4; + dst4 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst5, dst6); + dst7 = __lsx_vldx(dst, dst_stride3); + dst -= dst_stride4; + + DUP4_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1, src2, dst2, src3, dst3, + dst0, dst1, dst2, dst3); + DUP4_ARG2(__lsx_vavgr_bu, src4, dst4, src5, dst5, src6, dst6, src7, dst7, + dst4, dst5, dst6, dst7); + + __lsx_vst(dst0, dst, 0); + __lsx_vstx(dst1, dst, dst_stride); + __lsx_vstx(dst2, dst, dst_stride2); + __lsx_vstx(dst3, dst, dst_stride3); + dst += dst_stride4; + __lsx_vst(dst4, dst, 0); + __lsx_vstx(dst5, dst, dst_stride); + __lsx_vstx(dst6, dst, dst_stride2); + __lsx_vstx(dst7, dst, dst_stride3); + dst 
+= dst_stride4;
+  }
+}
+
+static void avg_width32_lsx(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride, int32_t height) {
+  int32_t cnt = (height / 8);
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i src8, src9, src10, src11, src12, src13, src14, src15;
+  __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+  __m128i dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+  int32_t dst_stride4 = dst_stride2 << 1;
+
+  for (; cnt--;) {
+    uint8_t *dst_tmp = dst;
+    uint8_t *dst_tmp1 = dst_tmp + 16;
+    /* src_tmp tracks the right 16-byte half of each 32-wide row. */
+    uint8_t *src_tmp = (uint8_t *)src + 16;
+
+    DUP2_ARG2(__lsx_vld, src, 0, src_tmp, 0, src0, src1);
+    DUP4_ARG2(__lsx_vldx, src, src_stride, src_tmp, src_stride, src,
+              src_stride2, src_tmp, src_stride2, src2, src3, src4, src5);
+    DUP2_ARG2(__lsx_vldx, src, src_stride3, src_tmp, src_stride3, src6, src7);
+    src += src_stride4;
+
+    DUP2_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp1, 0, dst0, dst1);
+    DUP4_ARG2(__lsx_vldx, dst_tmp, dst_stride, dst_tmp1, dst_stride, dst_tmp,
+              dst_stride2, dst_tmp1, dst_stride2, dst2, dst3, dst4, dst5);
+    DUP2_ARG2(__lsx_vldx, dst_tmp, dst_stride3, dst_tmp1, dst_stride3, dst6,
+              dst7);
+    dst_tmp += dst_stride4;
+    dst_tmp1 += dst_stride4;
+
+    src_tmp = (uint8_t *)src + 16;
+    DUP2_ARG2(__lsx_vld, src, 0, src_tmp, 0, src8, src9);
+    DUP4_ARG2(__lsx_vldx, src, src_stride, src_tmp, src_stride, src,
+              src_stride2, src_tmp, src_stride2, src10, src11, src12, src13);
+    DUP2_ARG2(__lsx_vldx, src, src_stride3, src_tmp, src_stride3, src14, src15);
+    src += src_stride4;
+
+    DUP2_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp1, 0, dst8, dst9);
+    DUP4_ARG2(__lsx_vldx, dst_tmp, dst_stride, dst_tmp1, dst_stride, dst_tmp,
+              dst_stride2, dst_tmp1, dst_stride2, dst10, dst11, dst12, dst13);
+    DUP2_ARG2(__lsx_vldx, dst_tmp, dst_stride3, dst_tmp1, dst_stride3, dst14,
+              dst15);
+    DUP4_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+              dst0, dst1, dst2, dst3);
+    DUP4_ARG2(__lsx_vavgr_bu, src4, dst4, src5, dst5, src6, dst6, src7, dst7,
+              dst4, dst5, dst6, dst7);
+    DUP4_ARG2(__lsx_vavgr_bu, src8, dst8, src9, dst9, src10, dst10, src11,
+              dst11, dst8, dst9, dst10, dst11);
+    DUP4_ARG2(__lsx_vavgr_bu, src12, dst12, src13, dst13, src14, dst14, src15,
+              dst15, dst12, dst13, dst14, dst15);
+
+    dst_tmp = dst + 16;
+    __lsx_vst(dst0, dst, 0);
+    __lsx_vstx(dst2, dst, dst_stride);
+    __lsx_vstx(dst4, dst, dst_stride2);
+    __lsx_vstx(dst6, dst, dst_stride3);
+    __lsx_vst(dst1, dst_tmp, 0);
+    __lsx_vstx(dst3, dst_tmp, dst_stride);
+    __lsx_vstx(dst5, dst_tmp, dst_stride2);
+    __lsx_vstx(dst7, dst_tmp, dst_stride3);
+    dst += dst_stride4;
+
+    __lsx_vst(dst8, dst, 0);
+    __lsx_vstx(dst10, dst, dst_stride);
+    __lsx_vstx(dst12, dst, dst_stride2);
+    __lsx_vstx(dst14, dst, dst_stride3);
+    __lsx_vst(dst9, dst_tmp1, 0);
+    __lsx_vstx(dst11, dst_tmp1, dst_stride);
+    __lsx_vstx(dst13, dst_tmp1, dst_stride2);
+    __lsx_vstx(dst15, dst_tmp1, dst_stride3);
+    dst += dst_stride4;
+  }
+}
+
+static void avg_width64_lsx(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride, int32_t height) {
+  int32_t cnt = (height / 4);
+  uint8_t *dst_tmp = dst;
+
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i src8, src9, src10, src11, src12, src13, src14, src15;
+  __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+  __m128i dst8,
dst9, dst10, dst11, dst12, dst13, dst14, dst15; + + for (; cnt--;) { + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, + src3); + src += src_stride; + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src4, src5, src6, + src7); + src += src_stride; + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src8, src9, src10, + src11); + src += src_stride; + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src12, src13, src14, + src15); + src += src_stride; + + DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48, + dst0, dst1, dst2, dst3); + dst_tmp += dst_stride; + DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48, + dst4, dst5, dst6, dst7); + dst_tmp += dst_stride; + DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48, + dst8, dst9, dst10, dst11); + dst_tmp += dst_stride; + DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48, + dst12, dst13, dst14, dst15); + dst_tmp += dst_stride; + + DUP4_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1, src2, dst2, src3, dst3, + dst0, dst1, dst2, dst3); + DUP4_ARG2(__lsx_vavgr_bu, src4, dst4, src5, dst5, src6, dst6, src7, dst7, + dst4, dst5, dst6, dst7); + DUP4_ARG2(__lsx_vavgr_bu, src8, dst8, src9, dst9, src10, dst10, src11, + dst11, dst8, dst9, dst10, dst11); + DUP4_ARG2(__lsx_vavgr_bu, src12, dst12, src13, dst13, src14, dst14, src15, + dst15, dst12, dst13, dst14, dst15); + + __lsx_vst(dst0, dst, 0); + __lsx_vst(dst1, dst, 16); + __lsx_vst(dst2, dst, 32); + __lsx_vst(dst3, dst, 48); + dst += dst_stride; + __lsx_vst(dst4, dst, 0); + __lsx_vst(dst5, dst, 16); + __lsx_vst(dst6, dst, 32); + __lsx_vst(dst7, dst, 48); + dst += dst_stride; + __lsx_vst(dst8, dst, 0); + __lsx_vst(dst9, dst, 16); + __lsx_vst(dst10, dst, 32); + __lsx_vst(dst11, dst, 48); + dst += dst_stride; + __lsx_vst(dst12, dst, 0); + __lsx_vst(dst13, dst, 16); + __lsx_vst(dst14, dst, 32); + __lsx_vst(dst15, dst, 48); + dst += dst_stride; + } +} + +void vpx_convolve_avg_lsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int32_t y_step_q4, + int32_t w, int32_t h) { + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + switch (w) { + case 4: { + avg_width4_lsx(src, src_stride, dst, dst_stride, h); + break; + } + + case 8: { + avg_width8_lsx(src, src_stride, dst, dst_stride, h); + break; + } + case 16: { + avg_width16_lsx(src, src_stride, dst, dst_stride, h); + break; + } + case 32: { + avg_width32_lsx(src, src_stride, dst, dst_stride, h); + break; + } + case 64: { + avg_width64_lsx(src, src_stride, dst, dst_stride, h); + break; + } + default: { + int32_t lp, cnt; + for (cnt = h; cnt--;) { + for (lp = 0; lp < w; ++lp) { + dst[lp] = (((dst[lp] + src[lp]) + 1) >> 1); + } + src += src_stride; + dst += dst_stride; + } + break; + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_copy_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_copy_lsx.c new file mode 100644 index 0000000000..53dc7097ed --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_copy_lsx.c @@ -0,0 +1,437 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_copy_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_copy_lsx.c
new file mode 100644
index 0000000000..53dc7097ed
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_copy_lsx.c
@@ -0,0 +1,437 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_util/loongson_intrinsics.h"
+
+static void copy_width8_lsx(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride, int32_t height) {
+  int32_t cnt;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  if ((height % 12) == 0) {
+    for (cnt = (height / 12); cnt--;) {
+      src0 = __lsx_vld(src, 0);
+      DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src,
+                src_stride3, src, src_stride4, src1, src2, src3, src4);
+      src += src_stride4;
+      DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6);
+      src += src_stride2;
+      src7 = __lsx_vldx(src, src_stride);
+      src += src_stride2;
+
+      __lsx_vstelm_d(src0, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src1, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src2, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src3, dst, 0, 0);
+      dst += dst_stride;
+
+      __lsx_vstelm_d(src4, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src5, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src6, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src7, dst, 0, 0);
+      dst += dst_stride;
+
+      src0 = __lsx_vld(src, 0);
+      DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+      src3 = __lsx_vldx(src, src_stride3);
+      src += src_stride4;
+
+      __lsx_vstelm_d(src0, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src1, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src2, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src3, dst, 0, 0);
+      dst += dst_stride;
+    }
+  } else if ((height % 8) == 0) {
+    for (cnt = height >> 3; cnt--;) {
+      src0 = __lsx_vld(src, 0);
+      DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src,
+                src_stride3, src, src_stride4, src1, src2, src3, src4);
+      src += src_stride4;
+      DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6);
+      src += src_stride2;
+      src7 = __lsx_vldx(src, src_stride);
+      src += src_stride2;
+
+      __lsx_vstelm_d(src0, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src1, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src2, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src3, dst, 0, 0);
+      dst += dst_stride;
+
+      __lsx_vstelm_d(src4, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src5, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src6, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src7, dst, 0, 0);
+      dst += dst_stride;
+    }
+  } else if ((height % 4) == 0) {
+    for (cnt = (height / 4); cnt--;) {
+      src0 = __lsx_vld(src, 0);
+      DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+      src3 = __lsx_vldx(src, src_stride3);
+      src += src_stride4;
+
+      __lsx_vstelm_d(src0, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src1, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src2, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src3, dst, 0, 0);
+      dst += dst_stride;
+    }
+  } else if ((height % 2) == 0) {
+    for (cnt = (height / 2); cnt--;) {
+      src0 = __lsx_vld(src, 0);
+      src1 = __lsx_vldx(src, src_stride);
+      src += src_stride2;
+
+      __lsx_vstelm_d(src0, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src1, dst, 0, 0);
+      dst += dst_stride;
+    }
+  }
+}
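copy_width8_lsx loads a full 16-byte vector per row but stores only the low doubleword with __lsx_vstelm_d(vec, dst, 0, 0), so every row is an 8-byte copy; the height % 12 / % 8 / % 4 / % 2 branches differ only in unroll depth, not semantics. A scalar sketch (hypothetical name):

    #include <stdint.h>
    #include <string.h>

    /* Per-row equivalent of copy_width8_lsx: copy 8 bytes, step strides. */
    static void copy_width8_c(const uint8_t *src, int src_stride,
                              uint8_t *dst, int dst_stride, int height) {
      while (height--) {
        memcpy(dst, src, 8); /* the low 64 bits of the loaded vector */
        src += src_stride;
        dst += dst_stride;
      }
    }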
+
+static void copy_16multx8mult_lsx(const uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  int32_t height, int32_t width) {
+  int32_t cnt, loop_cnt;
+  uint8_t *src_tmp;
+  uint8_t *dst_tmp;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  for (cnt = (width >> 4); cnt--;) {
+    src_tmp = (uint8_t *)src;
+    dst_tmp = dst;
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+      src0 = __lsx_vld(src_tmp, 0);
+      DUP4_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2,
+                src_tmp, src_stride3, src_tmp, src_stride4, src1, src2, src3,
+                src4);
+      src_tmp += src_stride4;
+      DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5,
+                src6);
+      src_tmp += src_stride2;
+      src7 = __lsx_vldx(src_tmp, src_stride);
+      src_tmp += src_stride2;
+
+      __lsx_vst(src0, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src1, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src2, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src3, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src4, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src5, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src6, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src7, dst_tmp, 0);
+      dst_tmp += dst_stride;
+    }
+    src += 16;
+    dst += 16;
+  }
+}
+
+static void copy_width16_lsx(const uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride, int32_t height) {
+  int32_t cnt;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  if ((height % 12) == 0) {
+    for (cnt = (height / 12); cnt--;) {
+      src0 = __lsx_vld(src, 0);
+      DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src,
+                src_stride3, src, src_stride4, src1, src2, src3, src4);
+      src += src_stride4;
+      DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6);
+      src += src_stride2;
+      src7 = __lsx_vldx(src, src_stride);
+      src += src_stride2;
+
+      __lsx_vst(src0, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src1, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src2, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src3, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src4, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src5, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src6, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src7, dst, 0);
+      dst += dst_stride;
+
+      src0 = __lsx_vld(src, 0);
+      DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+      src3 = __lsx_vldx(src, src_stride3);
+      src += src_stride4;
+
+      __lsx_vst(src0, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src1, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src2, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src3, dst, 0);
+      dst += dst_stride;
+    }
+  } else if ((height % 8) == 0) {
+    copy_16multx8mult_lsx(src, src_stride, dst, dst_stride, height, 16);
+  } else if ((height % 4) == 0) {
+    for (cnt = (height >> 2); cnt--;) {
+      src0 = __lsx_vld(src, 0);
+      DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+      src3 = __lsx_vldx(src, src_stride3);
+      src += src_stride4;
+
+      __lsx_vst(src0, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src1, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src2, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src3, dst, 0);
+      dst += dst_stride;
+    }
+  }
+}
+
+static void copy_width32_lsx(const uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride, int32_t height) {
+  int32_t cnt;
+  uint8_t *src_tmp;
+  uint8_t *dst_tmp;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  if ((height % 12) == 0) {
+    for (cnt = (height / 12); cnt--;) {
+      src0 = __lsx_vld(src, 0);
+      DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+      src3 = __lsx_vldx(src, src_stride3);
+
+      src_tmp = (uint8_t *)src + 16;
+      src4 = __lsx_vld(src_tmp, 0);
+      DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5,
+                src6);
+      src7 = __lsx_vldx(src_tmp, src_stride3);
+      src += src_stride4;
+
+      dst_tmp = dst + 16;
+      __lsx_vst(src0, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src1, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src2, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src3, dst, 0);
+      dst += dst_stride;
+
+      __lsx_vst(src4, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src5, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src6, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src7, dst_tmp, 0);
+      dst_tmp += dst_stride;
+
+      src0 = __lsx_vld(src, 0);
+      DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+      src3 = __lsx_vldx(src, src_stride3);
+
+      src_tmp = (uint8_t *)src + 16;
+      src4 = __lsx_vld(src_tmp, 0);
+      DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5,
+                src6);
+      src7 = __lsx_vldx(src_tmp, src_stride3);
+      src += src_stride4;
+
+      dst_tmp = dst + 16;
+      __lsx_vst(src0, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src1, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src2, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src3, dst, 0);
+      dst += dst_stride;
+
+      __lsx_vst(src4, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src5, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src6, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src7, dst_tmp, 0);
+      dst_tmp += dst_stride;
+
+      src0 = __lsx_vld(src, 0);
+      DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+      src3 = __lsx_vldx(src, src_stride3);
+
+      src_tmp = (uint8_t *)src + 16;
+      src4 = __lsx_vld(src_tmp, 0);
+      DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5,
+                src6);
+      src7 = __lsx_vldx(src_tmp, src_stride3);
+      src += src_stride4;
+
+      dst_tmp = dst + 16;
+      __lsx_vst(src0, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src1, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src2, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src3, dst, 0);
+      dst += dst_stride;
+
+      __lsx_vst(src4, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src5, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src6, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src7, dst_tmp, 0);
+      dst_tmp += dst_stride;
+    }
+  } else if ((height % 8) == 0) {
+    copy_16multx8mult_lsx(src, src_stride, dst, dst_stride, height, 32);
+  } else if ((height % 4) == 0) {
+    for (cnt = (height >> 2); cnt--;) {
+      src0 = __lsx_vld(src, 0);
+      DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+      src3 = __lsx_vldx(src, src_stride3);
+
+      src_tmp = (uint8_t *)src + 16;
+      src4 = __lsx_vld(src_tmp, 0);
+      DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5,
+                src6);
+      src7 = __lsx_vldx(src_tmp, src_stride3);
+      src += src_stride4;
+
+      dst_tmp = dst + 16;
+      __lsx_vst(src0, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src1, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src2, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src3, dst, 0);
+      dst += dst_stride;
+
+      __lsx_vst(src4, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src5, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src6, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src7, dst_tmp, 0);
+      dst_tmp += dst_stride;
+    }
+  }
+}
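copy_16multx8mult_lsx is the generic engine for widths that are multiples of 16 (and heights that are multiples of 8); copy_width64_lsx below is a thin wrapper over it with width = 64. Its scalar shape, for reference (hypothetical name):

    #include <stdint.h>
    #include <string.h>

    /* Column-major tiling: for each 16-byte column, copy `height` rows. */
    static void copy_16mult_c(const uint8_t *src, int src_stride,
                              uint8_t *dst, int dst_stride, int height,
                              int width) {
      int col, row;
      for (col = 0; col < width; col += 16) {
        const uint8_t *s = src + col;
        uint8_t *d = dst + col;
        for (row = 0; row < height; ++row) {
          memcpy(d, s, 16); /* one __lsx_vld/__lsx_vst pair */
          s += src_stride;
          d += dst_stride;
        }
      }
    }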
+
+static void copy_width64_lsx(const uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride, int32_t height) {
+  copy_16multx8mult_lsx(src, src_stride, dst, dst_stride, height, 64);
+}
+
+void vpx_convolve_copy_lsx(const uint8_t *src, ptrdiff_t src_stride,
+                           uint8_t *dst, ptrdiff_t dst_stride,
+                           const InterpKernel *filter, int x0_q4,
+                           int32_t x_step_q4, int y0_q4, int32_t y_step_q4,
+                           int32_t w, int32_t h) {
+  (void)filter;
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
+
+  switch (w) {
+    case 4: {
+      uint32_t cnt;
+      __m128i tmp;
+      for (cnt = h; cnt--;) {
+        tmp = __lsx_vldrepl_w(src, 0);
+        __lsx_vstelm_w(tmp, dst, 0, 0);
+        src += src_stride;
+        dst += dst_stride;
+      }
+      break;
+    }
+    case 8: {
+      copy_width8_lsx(src, src_stride, dst, dst_stride, h);
+      break;
+    }
+    case 16: {
+      copy_width16_lsx(src, src_stride, dst, dst_stride, h);
+      break;
+    }
+    case 32: {
+      copy_width32_lsx(src, src_stride, dst, dst_stride, h);
+      break;
+    }
+    case 64: {
+      copy_width64_lsx(src, src_stride, dst, dst_stride, h);
+      break;
+    }
+    default: {
+      uint32_t cnt;
+      for (cnt = h; cnt--;) {
+        memcpy(dst, src, w);
+        src += src_stride;
+        dst += dst_stride;
+      }
+      break;
+    }
+  }
+}
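In the dispatcher above, the w == 4 case stays inline: __lsx_vldrepl_w reads one 32-bit word (replicating it across lanes) and __lsx_vstelm_w writes word element 0 back, so each row is a 4-byte copy. A scalar model (hypothetical name):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* The w == 4 branch of vpx_convolve_copy_lsx: one word per row. */
    static void copy_width4_c(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride, int h) {
      while (h--) {
        memcpy(dst, src, 4); /* vldrepl.w load, vstelm.w store */
        src += src_stride;
        dst += dst_stride;
      }
    }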
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_lsx.h b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_lsx.h
new file mode 100644
index 0000000000..d886b00198
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_lsx.h
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_LOONGARCH_VPX_CONVOLVE_LSX_H_
+#define VPX_VPX_DSP_LOONGARCH_VPX_CONVOLVE_LSX_H_
+
+#include "./vpx_config.h"
+#include "vpx_dsp/vpx_filter.h"
+#include "vpx_util/loongson_intrinsics.h"
+
+static INLINE __m128i filt_8tap_dpadd_s_h(__m128i _reg0, __m128i _reg1,
+                                          __m128i _reg2, __m128i _reg3,
+                                          __m128i _filter0, __m128i _filter1,
+                                          __m128i _filter2, __m128i _filter3) {
+  __m128i _vec0, _vec1;
+
+  _vec0 = __lsx_vdp2_h_b(_reg0, _filter0);
+  _vec0 = __lsx_vdp2add_h_b(_vec0, _reg1, _filter1);
+  _vec1 = __lsx_vdp2_h_b(_reg2, _filter2);
+  _vec1 = __lsx_vdp2add_h_b(_vec1, _reg3, _filter3);
+  return __lsx_vsadd_h(_vec0, _vec1);
+}
+
+static INLINE __m128i horiz_8tap_filt(__m128i _src0, __m128i _src1,
+                                      __m128i _mask0, __m128i _mask1,
+                                      __m128i _mask2, __m128i _mask3,
+                                      __m128i _filt_h0, __m128i _filt_h1,
+                                      __m128i _filt_h2, __m128i _filt_h3) {
+  __m128i _tmp0, _tmp1, _tmp2, _tmp3;
+  __m128i _out;
+
+  DUP4_ARG3(__lsx_vshuf_b, _src1, _src0, _mask0, _src1, _src0, _mask1, _src1,
+            _src0, _mask2, _src1, _src0, _mask3, _tmp0, _tmp1, _tmp2, _tmp3);
+  _out = filt_8tap_dpadd_s_h(_tmp0, _tmp1, _tmp2, _tmp3, _filt_h0, _filt_h1,
+                             _filt_h2, _filt_h3);
+  _out = __lsx_vsrari_h(_out, FILTER_BITS);
+  return __lsx_vsat_h(_out, 7);
+}
+
+static INLINE __m128i horiz_2tap_filt_uh(__m128i in0, __m128i in1,
+                                         __m128i mask, __m128i coeff) {
+  __m128i tmp0_m, tmp1_m;
+
+  tmp0_m = __lsx_vshuf_b(in1, in0, mask);
+  tmp1_m = __lsx_vdp2_h_bu(tmp0_m, coeff);
+  return __lsx_vsrari_h(tmp1_m, FILTER_BITS);
+}
+
+#define LSX_LD_4(_src, _stride, _src0, _src1, _src2, _src3)  \
+  do {                                                       \
+    _src0 = __lsx_vld(_src, 0);                              \
+    _src += _stride;                                         \
+    _src1 = __lsx_vld(_src, 0);                              \
+    _src += _stride;                                         \
+    _src2 = __lsx_vld(_src, 0);                              \
+    _src += _stride;                                         \
+    _src3 = __lsx_vld(_src, 0);                              \
+  } while (0)
+
+#define HORIZ_8TAP_4WID_4VECS_FILT(_src0, _src1, _src2, _src3, _mask0,       \
+                                   _mask1, _mask2, _mask3, _filter0,         \
+                                   _filter1, _filter2, _filter3, _out0,      \
+                                   _out1)                                    \
+  do {                                                                       \
+    __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7;          \
+    __m128i _reg0, _reg1, _reg2, _reg3;                                      \
+                                                                             \
+    DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask0, _src3, _src2, _mask0,     \
+              _tmp0, _tmp1);                                                 \
+    DUP2_ARG2(__lsx_vdp2_h_b, _tmp0, _filter0, _tmp1, _filter0, _reg0,       \
+              _reg1);                                                        \
+    DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask1, _src3, _src2, _mask1,     \
+              _tmp2, _tmp3);                                                 \
+    DUP2_ARG3(__lsx_vdp2add_h_b, _reg0, _tmp2, _filter1, _reg1, _tmp3,       \
+              _filter1, _reg0, _reg1);                                       \
+    DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask2, _src3, _src2, _mask2,     \
+              _tmp4, _tmp5);                                                 \
+    DUP2_ARG2(__lsx_vdp2_h_b, _tmp4, _filter2, _tmp5, _filter2, _reg2,       \
+              _reg3);                                                        \
+    DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask3, _src3, _src2, _mask3,     \
+              _tmp6, _tmp7);                                                 \
+    DUP2_ARG3(__lsx_vdp2add_h_b, _reg2, _tmp6, _filter3, _reg3, _tmp7,       \
+              _filter3, _reg2, _reg3);                                       \
+    DUP2_ARG2(__lsx_vsadd_h, _reg0, _reg2, _reg1, _reg3, _out0, _out1);      \
+  } while (0)
+
+#define HORIZ_8TAP_8WID_4VECS_FILT(                                          \
+    _src0, _src1, _src2, _src3, _mask0, _mask1, _mask2, _mask3, _filter0,    \
+    _filter1, _filter2, _filter3, _out0, _out1, _out2, _out3)                \
+  do {                                                                       \
+    __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7;          \
+    __m128i _reg0, _reg1, _reg2, _reg3, _reg4, _reg5, _reg6, _reg7;          \
+                                                                             \
+    DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask0, _src1, _src1, _mask0,     \
+              _src2, _src2, _mask0, _src3, _src3, _mask0, _tmp0, _tmp1,      \
+              _tmp2, _tmp3);                                                 \
+    DUP4_ARG2(__lsx_vdp2_h_b, _tmp0, _filter0, _tmp1, _filter0, _tmp2,       \
+              _filter0, _tmp3, _filter0, _reg0, _reg1, _reg2, _reg3);        \
+    DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask2, _src1, _src1, _mask2,     \
+              _src2, _src2, _mask2, _src3, _src3, _mask2, _tmp0, _tmp1,      \
+              _tmp2, _tmp3);                                                 \
+    DUP4_ARG2(__lsx_vdp2_h_b, _tmp0, _filter2, _tmp1, _filter2, _tmp2,       \
+              _filter2, _tmp3, _filter2, _reg4, _reg5, _reg6, _reg7);        \
+    DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask1, _src1, _src1, _mask1,     \
+              _src2, _src2, _mask1, _src3, _src3, _mask1, _tmp4, _tmp5,      \
+              _tmp6, _tmp7);                                                 \
+    DUP4_ARG3(__lsx_vdp2add_h_b, _reg0, _tmp4, _filter1, _reg1, _tmp5,       \
+              _filter1, _reg2, _tmp6, _filter1, _reg3, _tmp7, _filter1,      \
+              _reg0, _reg1, _reg2, _reg3);                                   \
+    DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask3, _src1, _src1, _mask3,     \
+              _src2, _src2, _mask3, _src3, _src3, _mask3, _tmp4, _tmp5,      \
+              _tmp6, _tmp7);                                                 \
+    DUP4_ARG3(__lsx_vdp2add_h_b, _reg4, _tmp4, _filter3, _reg5, _tmp5,       \
+              _filter3, _reg6, _tmp6, _filter3, _reg7, _tmp7, _filter3,      \
+              _reg4, _reg5, _reg6, _reg7);                                   \
+    DUP4_ARG2(__lsx_vsadd_h, _reg0, _reg4, _reg1, _reg5, _reg2, _reg6,       \
+              _reg3, _reg7, _out0, _out1, _out2, _out3);                     \
+  } while (0)
+
+#define AVG_ST4_D(in0, in1, dst0, dst1, pdst, stride)                \
+  do {                                                               \
+    __m128i tmp0_m, tmp1_m;                                          \
+                                                                     \
+    DUP2_ARG2(__lsx_vavgr_bu, in0, dst0, in1, dst1, tmp0_m, tmp1_m); \
+    __lsx_vstelm_d(tmp0_m, pdst, 0, 0);                              \
+    pdst += stride;                                                  \
+    __lsx_vstelm_d(tmp0_m, pdst, 0, 1);                              \
+    pdst += stride;                                                  \
+    __lsx_vstelm_d(tmp1_m, pdst, 0, 0);                              \
+    pdst += stride;                                                  \
+    __lsx_vstelm_d(tmp1_m, pdst, 0, 1);                              \
+  } while (0)
+
+#endif  // VPX_VPX_DSP_LOONGARCH_VPX_CONVOLVE_LSX_H_
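For reading the filter helpers in this header: __lsx_vdp2_h_b/__lsx_vdp2add_h_b form signed-byte dot products accumulated into halfwords, __lsx_vsadd_h adds with saturation, __lsx_vsrari_h performs a rounding arithmetic shift right by FILTER_BITS (7 in vpx_dsp/vpx_filter.h), and __lsx_vsat_h(x, 7) clamps to [-128, 127]. Per output sample, filt_8tap_dpadd_s_h plus the two post-steps of horiz_8tap_filt compute roughly the following (a scalar sketch that ignores exactly where the intermediate saturating adds clip):

    #include <stdint.h>

    #define FILTER_BITS 7 /* as defined in vpx_dsp/vpx_filter.h */

    /* 8-tap convolution, rounded by FILTER_BITS, saturated to 8 bits. */
    static int8_t filt_8tap_c(const int8_t src[8], const int8_t filter[8]) {
      int32_t sum = 0;
      int k;
      for (k = 0; k < 8; ++k) sum += src[k] * filter[k]; /* vdp2/vdp2add */
      sum = (sum + (1 << (FILTER_BITS - 1))) >> FILTER_BITS; /* vsrari.h */
      if (sum < -128) sum = -128; /* vsat.h(x, 7) */
      if (sum > 127) sum = 127;
      return (int8_t)sum;
    }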