summary | refs | log | tree | commit | diff | stats
path: root/media/libvpx/libvpx/vpx_dsp/loongarch
diff options
context:
space:
mode:
Diffstat (limited to 'media/libvpx/libvpx/vpx_dsp/loongarch')
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/loongarch/avg_lsx.c 90
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/loongarch/avg_pred_lsx.c 83
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/loongarch/bitdepth_conversion_lsx.h 41
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/loongarch/fwd_dct32x32_lsx.c 1176
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.c 350
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.h 381
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/loongarch/idct32x32_lsx.c 834
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/loongarch/intrapred_lsx.c 98
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_16_lsx.c 1320
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_4_lsx.c 214
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_8_lsx.c 458
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_lsx.h 167
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/loongarch/quantize_intrin_lsx.c 248
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/loongarch/sad_lsx.c 717
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/loongarch/sub_pixel_variance_lsx.c 874
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/loongarch/subtract_lsx.c 371
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/loongarch/txfm_macros_lsx.h 48
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/loongarch/variance_lsx.c 263
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/loongarch/variance_lsx.h 62
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_horiz_lsx.c 972
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c 737
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c 918
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c 814
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_lsx.c 697
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c 825
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_avg_lsx.c 321
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_copy_lsx.c 437
-rw-r--r-- media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_lsx.h 138
28 files changed, 13654 insertions, 0 deletions
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/avg_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/avg_lsx.c
new file mode 100644
index 0000000000..750c9de29f
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/avg_lsx.c
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/bitdepth_conversion_lsx.h"
+
+/* 8x8 Hadamard transform of int16_t samples using LSX __m128i vectors.
+ *
+ * Loads eight rows from src, runs three butterfly stages, transposes the 8x8
+ * block, runs the same three butterfly stages again, and writes the eight
+ * result vectors to dst at element offsets 0, 8, ..., 56 through
+ * store_tran_low().
+ *
+ * src        - top-left sample of the 8x8 block (read only).
+ * src_stride - distance between rows; used as an int16_t element count in
+ *              the pointer arithmetic below.
+ * dst        - receives the 64 output coefficients.
+ */
+void vpx_hadamard_8x8_lsx(const int16_t *src, ptrdiff_t src_stride,
+ tran_low_t *dst) {
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ /* Offsets passed to __lsx_vldx: 2x, 4x and 6x the element stride.
+  * NOTE(review): presumably __lsx_vldx takes a byte offset, which would make
+  * these the offsets of rows 1, 2 and 3 for 2-byte samples - confirm against
+  * the LSX intrinsics reference. */
+ ptrdiff_t src_stride2 = src_stride << 1;
+ ptrdiff_t src_stride3 = src_stride2 + src_stride;
+ ptrdiff_t src_stride4 = src_stride2 << 1;
+ ptrdiff_t src_stride6 = src_stride3 << 1;
+
+ /* NOTE(review): the cast drops the const qualifier of src; the pointer is
+  * only read from in this function. */
+ int16_t *src_tmp = (int16_t *)src;
+ src0 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride2, src_tmp, src_stride4, src1, src2);
+ src3 = __lsx_vldx(src_tmp, src_stride6);
+ /* Here src_stride4 (= 4 * src_stride) is an element count, so the pointer
+  * moves down four rows. */
+ src_tmp += src_stride4;
+ src4 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride2, src_tmp, src_stride4, src5, src6);
+ src7 = __lsx_vldx(src_tmp, src_stride6);
+
+ /* First pass: three butterfly stages over the eight row vectors. */
+ LSX_BUTTERFLY_8_H(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2,
+ tmp4, tmp6, tmp7, tmp5, tmp3, tmp1);
+ LSX_BUTTERFLY_8_H(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1,
+ src4, src5, src7, src6, src3, src2);
+ LSX_BUTTERFLY_8_H(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7,
+ tmp3, tmp4, tmp5, tmp1, tmp6, tmp2);
+ /* Transpose so the second pass works along the other dimension. */
+ LSX_TRANSPOSE8x8_H(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1,
+ src2, src3, src4, src5, src6, src7);
+ /* Second pass: the same three butterfly stages with the same operand
+  * ordering as the first pass. */
+ LSX_BUTTERFLY_8_H(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2,
+ tmp4, tmp6, tmp7, tmp5, tmp3, tmp1);
+ LSX_BUTTERFLY_8_H(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1,
+ src4, src5, src7, src6, src3, src2);
+ LSX_BUTTERFLY_8_H(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7,
+ tmp3, tmp4, tmp5, tmp1, tmp6, tmp2);
+ /* Eight coefficients per vector, written back to back without stride. */
+ store_tran_low(tmp0, dst, 0);
+ store_tran_low(tmp1, dst, 8);
+ store_tran_low(tmp2, dst, 16);
+ store_tran_low(tmp3, dst, 24);
+ store_tran_low(tmp4, dst, 32);
+ store_tran_low(tmp5, dst, 40);
+ store_tran_low(tmp6, dst, 48);
+ store_tran_low(tmp7, dst, 56);
+}
+
+/* 16x16 Hadamard transform built from four 8x8 transforms.
+ *
+ * Each 8x8 quadrant of src is transformed by vpx_hadamard_8x8_lsx() into its
+ * own 64-coefficient slice of dst (offsets 0, 64, 128, 192). A final loop
+ * then combines the four slices eight coefficients at a time: one butterfly,
+ * a shift right by one on every vector (__lsx_vsrai_h with immediate 1), and
+ * a second butterfly. Results are written back in place.
+ *
+ * src        - top-left sample of the 16x16 block (read only).
+ * src_stride - distance between rows in int16_t elements.
+ * dst        - receives 256 output coefficients.
+ */
+void vpx_hadamard_16x16_lsx(const int16_t *src, ptrdiff_t src_stride,
+ tran_low_t *dst) {
+ int i;
+ __m128i a0, a1, a2, a3, b0, b1, b2, b3;
+
+ /* Rearrange 16x16 to 8x32 and remove stride.
+ * Top left first. */
+ vpx_hadamard_8x8_lsx(src + 0 + 0 * src_stride, src_stride, dst + 0);
+ /* Top right. */
+ vpx_hadamard_8x8_lsx(src + 8 + 0 * src_stride, src_stride, dst + 64);
+ /* Bottom left. */
+ vpx_hadamard_8x8_lsx(src + 0 + 8 * src_stride, src_stride, dst + 128);
+ /* Bottom right. */
+ vpx_hadamard_8x8_lsx(src + 8 + 8 * src_stride, src_stride, dst + 192);
+
+ /* Eight iterations; each handles the same 8-coefficient group of all four
+  * quadrants. dst itself is advanced, so the fixed offsets 0/64/128/192
+  * always address the current group. */
+ for (i = 0; i < 64; i += 8) {
+ a0 = load_tran_low(dst);
+ a1 = load_tran_low(dst + 64);
+ a2 = load_tran_low(dst + 128);
+ a3 = load_tran_low(dst + 192);
+
+ LSX_BUTTERFLY_4_H(a0, a2, a3, a1, b0, b2, b3, b1);
+ /* Halve the intermediate values between the two butterflies. */
+ DUP4_ARG2(__lsx_vsrai_h, b0, 1, b1, 1, b2, 1, b3, 1, b0, b1, b2, b3);
+ LSX_BUTTERFLY_4_H(b0, b1, b3, b2, a0, a1, a3, a2);
+
+ store_tran_low(a0, dst, 0);
+ store_tran_low(a1, dst, 64);
+ store_tran_low(a2, dst, 128);
+ store_tran_low(a3, dst, 192);
+
+ dst += 8;
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/avg_pred_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/avg_pred_lsx.c
new file mode 100644
index 0000000000..482626080a
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/avg_pred_lsx.c
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_util/loongson_intrinsics.h"
+
+/* Averages a prediction block with a reference block, byte by byte, and
+ * writes the result to comp_pred (averaging is done by __lsx_vavgr_bu).
+ *
+ * comp_pred  - output block; rows are packed (advanced by width per row).
+ * pred       - first input; rows are packed like comp_pred.
+ * width      - block width; the code distinguishes > 8, == 8 and 4.
+ * height     - block height in rows.
+ * ref        - second input; rows are ref_stride bytes apart.
+ * ref_stride - row pitch of ref.
+ *
+ * NOTE(review): implicit preconditions visible in the loops below -
+ *  - width > 8 path steps 16 columns at a time, so it presumably expects
+ *    width to be a multiple of 16;
+ *  - the width == 8 and width == 4 loops count down by 16 and stop only
+ *    when the counter reaches exactly zero, so height * width must be a
+ *    multiple of 16 (an even height for width 8, a multiple of 4 for
+ *    width 4).
+ * Confirm these against the callers.
+ */
+void vpx_comp_avg_pred_lsx(uint8_t *comp_pred, const uint8_t *pred, int width,
+ int height, const uint8_t *ref, int ref_stride) {
+ // width > 8 || width == 8 || width == 4
+ if (width > 8) {
+ int i, j;
+ for (i = 0; i < height; ++i) {
+ /* One vector (16 bytes) of pred and ref per step. */
+ for (j = 0; j < width; j += 16) {
+ __m128i p, r, avg;
+
+ p = __lsx_vld(pred + j, 0);
+ r = __lsx_vld(ref + j, 0);
+ avg = __lsx_vavgr_bu(p, r);
+ __lsx_vst(avg, comp_pred + j, 0);
+ }
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ }
+ } else if (width == 8) {
+ /* i counts the bytes still to produce; two 8-byte rows per iteration. */
+ int i = height * width;
+ do {
+ __m128i p, r, r_0, r_1;
+
+ /* pred is packed, so one load covers two consecutive rows. */
+ p = __lsx_vld(pred, 0);
+ /* NOTE(review): each ref row is fetched with a full-vector load although
+  * only its low 64 bits are kept by the interleave below - confirm the
+  * reference buffer is padded enough for the last row. */
+ r_0 = __lsx_vld(ref, 0);
+ ref += ref_stride;
+ r_1 = __lsx_vld(ref, 0);
+ ref += ref_stride;
+ /* Pack the low halves of the two ref rows into one vector. */
+ r = __lsx_vilvl_d(r_1, r_0);
+ r = __lsx_vavgr_bu(p, r);
+
+ __lsx_vst(r, comp_pred, 0);
+
+ pred += 16;
+ comp_pred += 16;
+ i -= 16;
+ } while (i);
+ } else { // width = 4
+ /* i counts the bytes still to produce; four 4-byte rows per iteration. */
+ int i = height * width;
+ assert(width == 4);
+ do {
+ __m128i p, r, r_0, r_1, r_2, r_3;
+ p = __lsx_vld(pred, 0);
+
+ if (width == ref_stride) {
+ /* ref is packed as well: four rows are contiguous, load them at once. */
+ r = __lsx_vld(ref, 0);
+ ref += 16;
+ } else {
+ /* Gather four strided rows and pack their low 32 bits together.
+  * NOTE(review): full-vector loads are used here too; same padding
+  * question as in the width == 8 path. */
+ r_0 = __lsx_vld(ref, 0);
+ ref += ref_stride;
+ r_1 = __lsx_vld(ref, 0);
+ ref += ref_stride;
+ r_2 = __lsx_vld(ref, 0);
+ ref += ref_stride;
+ r_3 = __lsx_vld(ref, 0);
+ ref += ref_stride;
+ DUP2_ARG2(__lsx_vilvl_w, r_1, r_0, r_3, r_2, r_0, r_2);
+ r = __lsx_vilvl_d(r_2, r_0);
+ }
+ r = __lsx_vavgr_bu(p, r);
+
+ __lsx_vst(r, comp_pred, 0);
+ comp_pred += 16;
+ pred += 16;
+ i -= 16;
+ } while (i);
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/bitdepth_conversion_lsx.h b/media/libvpx/libvpx/vpx_dsp/loongarch/bitdepth_conversion_lsx.h
new file mode 100644
index 0000000000..b0db1e99c5
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/bitdepth_conversion_lsx.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_LOONGARCH_BITDEPTH_CONVERSION_LSX_H_
+#define VPX_VPX_DSP_LOONGARCH_BITDEPTH_CONVERSION_LSX_H_
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_util/loongson_intrinsics.h"
+
+/* Loads coefficients from s into one __m128i.
+ *
+ * With CONFIG_VP9_HIGHBITDEPTH two vectors are read (at s and s + 4) and
+ * combined into one by __lsx_vsrlni_h_w with a shift of 0; otherwise a single
+ * vector is loaded directly.
+ * NOTE(review): presumably tran_low_t is 32 bits wide in high-bitdepth
+ * builds and 16 bits otherwise, so both paths yield eight 16-bit lanes -
+ * confirm against vpx_dsp_common.h.
+ */
+static INLINE __m128i load_tran_low(const tran_low_t *s) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ __m128i v0_m = __lsx_vld(s, 0);
+ __m128i v1_m = __lsx_vld(s + 4, 0);
+ /* Shift amount 0: no scaling, only the narrowing/packing is wanted. */
+ return __lsx_vsrlni_h_w(v0_m, v1_m, 0);
+#else
+ return __lsx_vld(s, 0);
+#endif
+}
+
+/* Stores the vector v to s + c (c is an element offset into s).
+ *
+ * With CONFIG_VP9_HIGHBITDEPTH the 16-bit lanes of v are widened to 32 bits
+ * first: __lsx_vsllwil_w_h (shift 0) produces the vector written at s + c and
+ * __lsx_vexth_w_h the one written at s + c + 4. Otherwise v is stored as is.
+ * This is the counterpart of load_tran_low().
+ */
+static INLINE void store_tran_low(__m128i v, tran_low_t *s, int32_t c) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ __m128i v0_m, v1_m;
+ v1_m = __lsx_vexth_w_h(v);
+ v0_m = __lsx_vsllwil_w_h(v, 0);
+ __lsx_vst(v0_m, s + c, 0);
+ __lsx_vst(v1_m, s + c + 4, 0);
+#else
+ __lsx_vst(v, s + c, 0);
+#endif
+}
+
+#endif // VPX_VPX_DSP_LOONGARCH_BITDEPTH_CONVERSION_LSX_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/fwd_dct32x32_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/fwd_dct32x32_lsx.c
new file mode 100644
index 0000000000..9bb3877212
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/fwd_dct32x32_lsx.c
@@ -0,0 +1,1176 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/fwd_txfm_lsx.h"
+#include "vpx_dsp/fwd_txfm.h"
+
+/* Widens one vector of 16-bit lanes into two vectors of 32-bit lanes:
+ * out0 comes from __lsx_vsllwil_w_h(in, 0) and out1 from __lsx_vexth_w_h(in).
+ * NOTE(review): presumably out0 holds the lower four lanes and out1 the
+ * upper four, both sign-extended - confirm against the LSX reference.
+ * `in` is expanded twice, so pass a plain variable, not an expression with
+ * side effects. */
+#define UNPCK_SH_SW(in, out0, out1) \
+ do { \
+ out0 = __lsx_vsllwil_w_h(in, 0); \
+ out1 = __lsx_vexth_w_h(in); \
+ } while (0)
+
+/* First stage of the 32-point column transform for an 8-column strip.
+ *
+ * Loads all 32 rows of the strip (one vector per row), shifts every sample
+ * left by 2, and butterflies groups of four rows from the top of the strip
+ * against groups of four rows from the bottom. The first four outputs of each
+ * butterfly (step*) are stored in the first half of temp_buff (offsets
+ * 0..240) and the last four (written back into in4..in7 / in4_1..in7_1) in
+ * the second half (offsets 256..496).
+ *
+ * input      - top of the 8-column strip (read only).
+ * src_stride - row pitch of input in int16_t elements.
+ * temp_buff  - receives 32 vectors.
+ *
+ * NOTE(review): the immediates given to __lsx_vst and the offsets given to
+ * __lsx_vldx are presumably byte offsets - confirm against the LSX
+ * intrinsics reference.
+ */
+static void fdct8x32_1d_column_load_butterfly(const int16_t *input,
+ int32_t src_stride,
+ int16_t *temp_buff) {
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i step0, step1, step2, step3;
+ __m128i in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1;
+ __m128i step0_1, step1_1, step2_1, step3_1;
+
+ /* stride, stride2, stride3 = 2x, 4x, 6x src_stride: the __lsx_vldx offsets
+  * of the 2nd, 3rd and 4th row of a four-row group. stride2 is also reused
+  * below as an element count that advances input_tmp by four rows. */
+ int32_t stride = src_stride << 1;
+ int32_t stride2 = stride << 1;
+ int32_t stride3 = stride2 + stride;
+ const int16_t *input_tmp = (int16_t *)input;
+
+ /* 1st and 2nd set: rows 0-3 and 4-7 ... */
+ in0 = __lsx_vld(input_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in1, in2);
+ in3 = __lsx_vldx(input_tmp, stride3);
+
+ input_tmp += stride2;
+ in0_1 = __lsx_vld(input_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in1_1, in2_1);
+ in3_1 = __lsx_vldx(input_tmp, stride3);
+
+ /* ... paired with rows 24-27 and 28-31. */
+ input_tmp = input + (src_stride * 24);
+ in4_1 = __lsx_vld(input_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in5_1, in6_1);
+ in7_1 = __lsx_vldx(input_tmp, stride3);
+
+ input_tmp += stride2;
+ in4 = __lsx_vld(input_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in5, in6);
+ in7 = __lsx_vldx(input_tmp, stride3);
+
+ /* Scale every sample by a left shift of 2 before the butterflies. */
+ DUP4_ARG2(__lsx_vslli_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3);
+ DUP4_ARG2(__lsx_vslli_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7);
+ DUP4_ARG2(__lsx_vslli_h, in0_1, 2, in1_1, 2, in2_1, 2, in3_1, 2, in0_1, in1_1,
+ in2_1, in3_1);
+ DUP4_ARG2(__lsx_vslli_h, in4_1, 2, in5_1, 2, in6_1, 2, in7_1, 2, in4_1, in5_1,
+ in6_1, in7_1);
+ /* Rows 0-3 against 28-31, then rows 4-7 against 24-27. */
+ LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2,
+ step3, in4, in5, in6, in7);
+ LSX_BUTTERFLY_8_H(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
+ step0_1, step1_1, step2_1, step3_1, in4_1, in5_1, in6_1,
+ in7_1);
+
+ __lsx_vst(step0, temp_buff, 0);
+ __lsx_vst(step1, temp_buff, 16);
+ __lsx_vst(step2, temp_buff, 32);
+ __lsx_vst(step3, temp_buff, 48);
+
+ __lsx_vst(in4, temp_buff, 448);
+ __lsx_vst(in5, temp_buff, 464);
+ __lsx_vst(in6, temp_buff, 480);
+ __lsx_vst(in7, temp_buff, 496);
+
+ __lsx_vst(step0_1, temp_buff, 64);
+ __lsx_vst(step1_1, temp_buff, 80);
+ __lsx_vst(step2_1, temp_buff, 96);
+ __lsx_vst(step3_1, temp_buff, 112);
+
+ __lsx_vst(in4_1, temp_buff, 384);
+ __lsx_vst(in5_1, temp_buff, 400);
+ __lsx_vst(in6_1, temp_buff, 416);
+ __lsx_vst(in7_1, temp_buff, 432);
+
+ /* 3rd and 4th set */
+ /* Rows 8-11 and 12-15, paired with rows 16-19 and 20-23. */
+ input_tmp = input + (src_stride * 8);
+ in0 = __lsx_vld(input_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in1, in2);
+ in3 = __lsx_vldx(input_tmp, stride3);
+
+ input_tmp += stride2;
+ in0_1 = __lsx_vld(input_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in1_1, in2_1);
+ in3_1 = __lsx_vldx(input_tmp, stride3);
+
+ input_tmp += stride2;
+ in4_1 = __lsx_vld(input_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in5_1, in6_1);
+ in7_1 = __lsx_vldx(input_tmp, stride3);
+
+ input_tmp += stride2;
+ in4 = __lsx_vld(input_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in5, in6);
+ in7 = __lsx_vldx(input_tmp, stride3);
+ DUP4_ARG2(__lsx_vslli_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3);
+ DUP4_ARG2(__lsx_vslli_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7);
+ DUP4_ARG2(__lsx_vslli_h, in0_1, 2, in1_1, 2, in2_1, 2, in3_1, 2, in0_1, in1_1,
+ in2_1, in3_1);
+ DUP4_ARG2(__lsx_vslli_h, in4_1, 2, in5_1, 2, in6_1, 2, in7_1, 2, in4_1, in5_1,
+ in6_1, in7_1);
+
+ /* Rows 8-11 against 20-23, then rows 12-15 against 16-19. */
+ LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2,
+ step3, in4, in5, in6, in7);
+ LSX_BUTTERFLY_8_H(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
+ step0_1, step1_1, step2_1, step3_1, in4_1, in5_1, in6_1,
+ in7_1);
+
+ __lsx_vst(step0, temp_buff, 128);
+ __lsx_vst(step1, temp_buff, 144);
+ __lsx_vst(step2, temp_buff, 160);
+ __lsx_vst(step3, temp_buff, 176);
+
+ __lsx_vst(in4, temp_buff, 320);
+ __lsx_vst(in5, temp_buff, 336);
+ __lsx_vst(in6, temp_buff, 352);
+ __lsx_vst(in7, temp_buff, 368);
+
+ __lsx_vst(step0_1, temp_buff, 192);
+ __lsx_vst(step1_1, temp_buff, 208);
+ __lsx_vst(step2_1, temp_buff, 224);
+ __lsx_vst(step3_1, temp_buff, 240);
+
+ __lsx_vst(in4_1, temp_buff, 256);
+ __lsx_vst(in5_1, temp_buff, 272);
+ __lsx_vst(in6_1, temp_buff, 288);
+ __lsx_vst(in7_1, temp_buff, 304);
+}
+
+/* Even half of the 32-point column transform.
+ *
+ * Reads 16 vectors from input (offsets 0..240), runs the remaining butterfly
+ * and rotation stages, post-processes each pair of results with
+ * FDCT32_POSTPROC_2V_POS_H and stores 16 output vectors into temp. Every
+ * store offset is a multiple of 128 (0, 128, ..., 1920), i.e. the outputs are
+ * spread over every other 64-byte slot of temp.
+ * NOTE(review): presumably those slots are the even-numbered coefficient
+ * rows of the 32x32 intermediate buffer - confirm against the caller.
+ *
+ * input - 16 vectors of stage-1 sums; only read here.
+ * temp  - destination buffer.
+ */
+static void fdct8x32_1d_column_even_store(int16_t *input, int16_t *temp) {
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i in8, in9, in10, in11, in12, in13, in14, in15;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i temp0, temp1;
+
+ /* fdct even */
+ /* Vectors 0-3 against 12-15, then 4-7 against 8-11; the last four outputs
+  * of each butterfly overwrite in12..in15 / in8..in11. */
+ DUP4_ARG2(__lsx_vld, input, 0, input, 16, input, 32, input, 48, in0, in1, in2,
+ in3);
+ DUP4_ARG2(__lsx_vld, input, 192, input, 208, input, 224, input, 240, in12,
+ in13, in14, in15);
+ LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in12, in13, in14, in15, vec0, vec1,
+ vec2, vec3, in12, in13, in14, in15);
+ DUP4_ARG2(__lsx_vld, input, 64, input, 80, input, 96, input, 112, in4, in5,
+ in6, in7);
+ DUP4_ARG2(__lsx_vld, input, 128, input, 144, input, 160, input, 176, in8, in9,
+ in10, in11);
+ LSX_BUTTERFLY_8_H(in4, in5, in6, in7, in8, in9, in10, in11, vec4, vec5, vec6,
+ vec7, in8, in9, in10, in11);
+
+ /* Stage 3 */
+ DUP4_ARG2(__lsx_vadd_h, vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0,
+ in1, in2, in3);
+ LSX_BUTTERFLY_4_H(in0, in1, in2, in3, temp0, in4, in1, in0);
+ DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0);
+ FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+ __lsx_vst(temp0, temp, 0);
+ __lsx_vst(temp1, temp, 1024);
+
+ DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
+ FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+ __lsx_vst(temp0, temp, 512);
+ __lsx_vst(temp1, temp, 1536);
+
+ /* Differences of the vec0..vec7 pairs; vec4..vec7 are overwritten. */
+ DUP4_ARG2(__lsx_vsub_h, vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, vec7,
+ vec6, vec5, vec4);
+ DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
+ DUP2_ARG2(__lsx_vadd_h, vec4, vec5, vec7, vec6, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
+ FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+ __lsx_vst(temp0, temp, 256);
+ __lsx_vst(temp1, temp, 1792);
+
+ DUP2_ARG2(__lsx_vsub_h, vec4, vec5, vec7, vec6, vec4, vec7);
+ DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
+ FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+ __lsx_vst(temp0, temp, 1280);
+ __lsx_vst(temp1, temp, 768);
+
+ /* Remaining outputs come from the butterfly differences in8..in15. */
+ DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
+ DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
+ DUP4_ARG2(__lsx_vadd_h, in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0,
+ vec1, vec6, in2);
+ DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
+ DUP2_ARG2(__lsx_vadd_h, in0, in1, in2, in3, vec0, vec7);
+ DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
+ FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+ __lsx_vst(temp0, temp, 128);
+ __lsx_vst(temp1, temp, 1920);
+
+ DUP2_ARG2(__lsx_vsub_h, in0, in1, in2, in3, in0, in2);
+ DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
+ FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+ __lsx_vst(temp0, temp, 1152);
+ __lsx_vst(temp1, temp, 896);
+
+ DUP2_ARG2(__lsx_vsub_h, in9, vec2, in14, vec5, vec2, vec5);
+ /* One operand is negated before the rotation. */
+ temp0 = __lsx_vneg_h(vec2);
+ DOTP_CONST_PAIR(temp0, vec5, cospi_24_64, cospi_8_64, in2, in1);
+ DUP4_ARG2(__lsx_vsub_h, in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0,
+ vec2, vec5);
+ DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
+ FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+ __lsx_vst(temp0, temp, 640);
+ __lsx_vst(temp1, temp, 1408);
+
+ DUP2_ARG2(__lsx_vadd_h, in3, in2, in0, in1, vec3, vec4);
+ /* Output order here is (temp0, temp1), unlike the calls above. */
+ DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
+ FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+ __lsx_vst(temp0, temp, 384);
+ __lsx_vst(temp1, temp, 1664);
+}
+
+/* Odd half of the 32-point column transform.
+ *
+ * Reads 16 vectors from input (offsets 0..240), produces 16 output vectors
+ * and stores them into temp_ptr at offsets that are multiples of 128
+ * (0, 128, ..., 1920), each pair after FDCT32_POSTPROC_2V_POS_H.
+ *
+ * input is also used as scratch space: eight intermediate differences are
+ * written back to offsets 32, 48, 64, 80, 160, 176, 192 and 208 and re-read
+ * in the second half of the function, so the buffer contents are destroyed.
+ *
+ * input    - 16 vectors of stage-1 differences; overwritten.
+ * temp_ptr - destination buffer.
+ */
+static void fdct8x32_1d_column_odd_store(int16_t *input, int16_t *temp_ptr) {
+ __m128i in16, in17, in18, in19, in20, in21, in22, in23;
+ __m128i in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5;
+ __m128i tmp0, tmp1;
+
+ DUP4_ARG2(__lsx_vld, input, 64, input, 80, input, 160, input, 176, in20, in21,
+ in26, in27);
+
+ DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
+ DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);
+
+ DUP4_ARG2(__lsx_vld, input, 32, input, 48, input, 192, input, 208, in18, in19,
+ in28, in29);
+
+ /* Park the differences in input; they are reloaded further down. */
+ vec4 = __lsx_vsub_h(in19, in20);
+ __lsx_vst(vec4, input, 64);
+ vec4 = __lsx_vsub_h(in18, in21);
+ __lsx_vst(vec4, input, 80);
+ vec4 = __lsx_vsub_h(in29, in26);
+ __lsx_vst(vec4, input, 160);
+ vec4 = __lsx_vsub_h(in28, in27);
+ __lsx_vst(vec4, input, 176);
+
+ in21 = __lsx_vadd_h(in18, in21);
+ in20 = __lsx_vadd_h(in19, in20);
+ in27 = __lsx_vadd_h(in28, in27);
+ in26 = __lsx_vadd_h(in29, in26);
+
+ DUP4_ARG2(__lsx_vld, input, 96, input, 112, input, 128, input, 144, in22,
+ in23, in24, in25);
+ DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
+ DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);
+
+ DUP4_ARG2(__lsx_vld, input, 0, input, 16, input, 224, input, 240, in16, in17,
+ in30, in31);
+
+ /* Second group of parked differences. Offsets 32/48/192/208 were already
+  * consumed into in18/in19/in28/in29 above, so overwriting them is safe. */
+ vec4 = __lsx_vsub_h(in17, in22);
+ __lsx_vst(vec4, input, 32);
+ vec4 = __lsx_vsub_h(in16, in23);
+ __lsx_vst(vec4, input, 48);
+ vec4 = __lsx_vsub_h(in31, in24);
+ __lsx_vst(vec4, input, 192);
+ vec4 = __lsx_vsub_h(in30, in25);
+ __lsx_vst(vec4, input, 208);
+
+ /* First eight outputs: built from the sums. */
+ DUP4_ARG2(__lsx_vadd_h, in16, in23, in17, in22, in30, in25, in31, in24, in16,
+ in17, in30, in31);
+ DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
+ DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);
+ DUP4_ARG2(__lsx_vadd_h, in16, in19, in17, in18, in30, in29, in31, in28, in27,
+ in22, in21, in25);
+ DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
+ DUP2_ARG2(__lsx_vadd_h, in27, in26, in25, in24, in23, in20);
+ DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
+ FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+ __lsx_vst(vec5, temp_ptr, 0);
+ __lsx_vst(vec4, temp_ptr, 1920);
+
+ DUP2_ARG2(__lsx_vsub_h, in27, in26, in25, in24, in22, in21);
+ DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
+ FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+ __lsx_vst(vec5, temp_ptr, 896);
+ __lsx_vst(vec4, temp_ptr, 1024);
+
+ DUP4_ARG2(__lsx_vsub_h, in17, in18, in16, in19, in31, in28, in30, in29, in23,
+ in26, in24, in20);
+ tmp0 = __lsx_vneg_h(in23);
+ DOTP_CONST_PAIR(tmp0, in20, cospi_28_64, cospi_4_64, in27, in25);
+ DUP2_ARG2(__lsx_vsub_h, in26, in27, in24, in25, in23, in20);
+ DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
+ FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+ __lsx_vst(vec4, temp_ptr, 1408);
+ __lsx_vst(vec5, temp_ptr, 512);
+
+ DUP2_ARG2(__lsx_vadd_h, in26, in27, in24, in25, in22, in21);
+ DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
+ FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+ __lsx_vst(vec4, temp_ptr, 384);
+ __lsx_vst(vec5, temp_ptr, 1536);
+
+ /* Remaining eight outputs: reload the differences parked in input. */
+ DUP4_ARG2(__lsx_vld, input, 32, input, 48, input, 64, input, 80, in22, in23,
+ in20, in21);
+ DUP4_ARG2(__lsx_vld, input, 160, input, 176, input, 192, input, 208, in26,
+ in27, in24, in25);
+ in16 = in20;
+ in17 = in21;
+ DUP2_ARG1(__lsx_vneg_h, in16, in17, tmp0, tmp1);
+ DOTP_CONST_PAIR(tmp0, in27, cospi_24_64, cospi_8_64, in20, in27);
+ DOTP_CONST_PAIR(tmp1, in26, cospi_24_64, cospi_8_64, in21, in26);
+ DUP4_ARG2(__lsx_vsub_h, in23, in20, in22, in21, in25, in26, in24, in27, in28,
+ in17, in18, in31);
+ DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
+ DUP2_ARG2(__lsx_vadd_h, in28, in29, in31, in30, in16, in19);
+ DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
+ FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+ __lsx_vst(vec5, temp_ptr, 1664);
+ __lsx_vst(vec4, temp_ptr, 256);
+
+ DUP2_ARG2(__lsx_vsub_h, in28, in29, in31, in30, in17, in18);
+ DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
+ FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+ __lsx_vst(vec5, temp_ptr, 640);
+ __lsx_vst(vec4, temp_ptr, 1280);
+
+ DUP4_ARG2(__lsx_vadd_h, in22, in21, in23, in20, in24, in27, in25, in26, in16,
+ in29, in30, in19);
+ tmp0 = __lsx_vneg_h(in16);
+ DOTP_CONST_PAIR(tmp0, in19, cospi_12_64, cospi_20_64, in28, in31);
+ DUP2_ARG2(__lsx_vsub_h, in29, in28, in30, in31, in16, in19);
+ DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
+ FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+ __lsx_vst(vec5, temp_ptr, 1152);
+ __lsx_vst(vec4, temp_ptr, 768);
+
+ DUP2_ARG2(__lsx_vadd_h, in29, in28, in30, in31, in17, in18);
+ DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
+ FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+ __lsx_vst(vec5, temp_ptr, 128);
+ __lsx_vst(vec4, temp_ptr, 1792);
+}
+
+/* Column pass of the 32x32 forward DCT for one 8-column strip.
+ *
+ * Runs the load/butterfly stage into tmp_buf, then the even half on the
+ * first 128 elements of tmp_buf and the odd half on the next 128 elements.
+ * The odd half writes to tmp_buf_big + 32, i.e. 32 elements after the even
+ * half's destination.
+ *
+ * input       - top of the strip (read only).
+ * src_stride  - row pitch of input in int16_t elements.
+ * tmp_buf     - scratch buffer of at least 256 int16_t; clobbered.
+ * tmp_buf_big - destination for the 32 output vectors.
+ */
+static void fdct8x32_1d_column(const int16_t *input, int32_t src_stride,
+ int16_t *tmp_buf, int16_t *tmp_buf_big) {
+ fdct8x32_1d_column_load_butterfly(input, src_stride, tmp_buf);
+ fdct8x32_1d_column_even_store(tmp_buf, tmp_buf_big);
+ fdct8x32_1d_column_odd_store(tmp_buf + 128, (tmp_buf_big + 32));
+}
+
+/* First stage of the 32-point row transform for an 8-row strip.
+ *
+ * In two passes, loads sixteen vectors from temp_buff (eight per 8x8 tile,
+ * 64 apart in load offset), transposes each 8x8 tile and runs a 16-input
+ * butterfly across the two tiles. The first eight butterfly outputs (step*)
+ * go to the first half of output (offsets 0..240), the last eight (written
+ * back into in8..in15) to the second half (offsets 256..496).
+ *
+ * temp_buff - source; only read here.
+ * output    - receives 32 vectors.
+ */
+static void fdct8x32_1d_row_load_butterfly(int16_t *temp_buff,
+ int16_t *output) {
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i in8, in9, in10, in11, in12, in13, in14, in15;
+ __m128i step0, step1, step2, step3, step4, step5, step6, step7;
+
+ /* 1st set: tiles starting at load offsets 0 and 48. */
+ DUP4_ARG2(__lsx_vld, temp_buff, 0, temp_buff, 64, temp_buff, 128, temp_buff,
+ 192, in0, in1, in2, in3);
+ DUP4_ARG2(__lsx_vld, temp_buff, 256, temp_buff, 320, temp_buff, 384,
+ temp_buff, 448, in4, in5, in6, in7);
+ DUP4_ARG2(__lsx_vld, temp_buff, 48, temp_buff, 112, temp_buff, 176, temp_buff,
+ 240, in8, in9, in10, in11);
+ DUP4_ARG2(__lsx_vld, temp_buff, 304, temp_buff, 368, temp_buff, 432,
+ temp_buff, 496, in12, in13, in14, in15);
+ LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ LSX_TRANSPOSE8x8_H(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
+ in10, in11, in12, in13, in14, in15);
+ LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,
+ in11, in12, in13, in14, in15, step0, step1, step2, step3,
+ step4, step5, step6, step7, in8, in9, in10, in11, in12,
+ in13, in14, in15);
+
+ __lsx_vst(step0, output, 0);
+ __lsx_vst(step1, output, 16);
+ __lsx_vst(step2, output, 32);
+ __lsx_vst(step3, output, 48);
+ __lsx_vst(step4, output, 64);
+ __lsx_vst(step5, output, 80);
+ __lsx_vst(step6, output, 96);
+ __lsx_vst(step7, output, 112);
+
+ __lsx_vst(in8, output, 384);
+ __lsx_vst(in9, output, 400);
+ __lsx_vst(in10, output, 416);
+ __lsx_vst(in11, output, 432);
+ __lsx_vst(in12, output, 448);
+ __lsx_vst(in13, output, 464);
+ __lsx_vst(in14, output, 480);
+ __lsx_vst(in15, output, 496);
+
+ /* 2nd set */
+ /* Tiles starting at load offsets 16 and 32. */
+ DUP4_ARG2(__lsx_vld, temp_buff, 16, temp_buff, 80, temp_buff, 144, temp_buff,
+ 208, in0, in1, in2, in3);
+ DUP4_ARG2(__lsx_vld, temp_buff, 272, temp_buff, 336, temp_buff, 400,
+ temp_buff, 464, in4, in5, in6, in7);
+ DUP4_ARG2(__lsx_vld, temp_buff, 32, temp_buff, 96, temp_buff, 160, temp_buff,
+ 224, in8, in9, in10, in11);
+ DUP4_ARG2(__lsx_vld, temp_buff, 288, temp_buff, 352, temp_buff, 416,
+ temp_buff, 480, in12, in13, in14, in15);
+ LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ LSX_TRANSPOSE8x8_H(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
+ in10, in11, in12, in13, in14, in15);
+ LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,
+ in11, in12, in13, in14, in15, step0, step1, step2, step3,
+ step4, step5, step6, step7, in8, in9, in10, in11, in12,
+ in13, in14, in15);
+
+ __lsx_vst(step0, output, 128);
+ __lsx_vst(step1, output, 144);
+ __lsx_vst(step2, output, 160);
+ __lsx_vst(step3, output, 176);
+ __lsx_vst(step4, output, 192);
+ __lsx_vst(step5, output, 208);
+ __lsx_vst(step6, output, 224);
+ __lsx_vst(step7, output, 240);
+
+ __lsx_vst(in8, output, 256);
+ __lsx_vst(in9, output, 272);
+ __lsx_vst(in10, output, 288);
+ __lsx_vst(in11, output, 304);
+ __lsx_vst(in12, output, 320);
+ __lsx_vst(in13, output, 336);
+ __lsx_vst(in14, output, 352);
+ __lsx_vst(in15, output, 368);
+}
+
+/* Even half of the 32-point row transform; the first four output vectors
+ * are computed in 32-bit lanes, the rest in 16-bit lanes.
+ *
+ * Reads 16 vectors from input (offsets 0..240), runs a 16-input butterfly
+ * and saves all 16 results in interm_ptr. The first four output vectors
+ * (out offsets 0..48) are then computed on values widened to 32-bit lanes
+ * (UNPCK_SH_SW, DOTP_CONST_PAIR_W, FDCT32_POSTPROC_NEG_W) and packed back to
+ * 16 bits with __lsx_vpickev_h. The remaining twelve outputs are computed
+ * from the vectors reloaded from interm_ptr and post-processed with
+ * FDCT_POSTPROC_2V_NEG_H.
+ *
+ * input      - 16 source vectors; only read here.
+ * interm_ptr - scratch for 16 vectors (offsets 0..240); clobbered.
+ * out        - receives 16 vectors (offsets 0..240).
+ */
+static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr,
+ int16_t *out) {
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i in8, in9, in10, in11, in12, in13, in14, in15;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i vec0_l, vec1_l, vec2_l, vec3_l, vec4_l, vec5_l, vec6_l, vec7_l;
+ __m128i vec0_r, vec1_r, vec2_r, vec3_r, vec4_r, vec5_r, vec6_r, vec7_r;
+ __m128i tmp0_w, tmp1_w, tmp2_w, tmp3_w;
+
+ /* fdct32 even */
+ /* stage 2 */
+ DUP4_ARG2(__lsx_vld, input, 0, input, 16, input, 32, input, 48, in0, in1, in2,
+ in3);
+ DUP4_ARG2(__lsx_vld, input, 64, input, 80, input, 96, input, 112, in4, in5,
+ in6, in7);
+ DUP4_ARG2(__lsx_vld, input, 128, input, 144, input, 160, input, 176, in8, in9,
+ in10, in11);
+ DUP4_ARG2(__lsx_vld, input, 192, input, 208, input, 224, input, 240, in12,
+ in13, in14, in15);
+
+ LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,
+ in11, in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4,
+ vec5, vec6, vec7, in8, in9, in10, in11, in12, in13, in14,
+ in15);
+
+ /* Save the stage-2 results: the 32-bit section below reuses the vec*
+  * registers, and the 16-bit section reloads everything from here. */
+ __lsx_vst(vec0, interm_ptr, 0);
+ __lsx_vst(vec1, interm_ptr, 16);
+ __lsx_vst(vec2, interm_ptr, 32);
+ __lsx_vst(vec3, interm_ptr, 48);
+ __lsx_vst(vec4, interm_ptr, 64);
+ __lsx_vst(vec5, interm_ptr, 80);
+ __lsx_vst(vec6, interm_ptr, 96);
+ __lsx_vst(vec7, interm_ptr, 112);
+
+ __lsx_vst(in8, interm_ptr, 128);
+ __lsx_vst(in9, interm_ptr, 144);
+ __lsx_vst(in10, interm_ptr, 160);
+ __lsx_vst(in11, interm_ptr, 176);
+ __lsx_vst(in12, interm_ptr, 192);
+ __lsx_vst(in13, interm_ptr, 208);
+ __lsx_vst(in14, interm_ptr, 224);
+ __lsx_vst(in15, interm_ptr, 240);
+
+ /* Stage 3 */
+ /* Widen vec0..vec7 to 32-bit lanes (two half vectors each). */
+ UNPCK_SH_SW(vec0, vec0_l, vec0_r);
+ UNPCK_SH_SW(vec1, vec1_l, vec1_r);
+ UNPCK_SH_SW(vec2, vec2_l, vec2_r);
+ UNPCK_SH_SW(vec3, vec3_l, vec3_r);
+ UNPCK_SH_SW(vec4, vec4_l, vec4_r);
+ UNPCK_SH_SW(vec5, vec5_l, vec5_r);
+ UNPCK_SH_SW(vec6, vec6_l, vec6_r);
+ UNPCK_SH_SW(vec7, vec7_l, vec7_r);
+ DUP4_ARG2(__lsx_vadd_w, vec0_r, vec7_r, vec1_r, vec6_r, vec2_r, vec5_r,
+ vec3_r, vec4_r, tmp0_w, tmp1_w, tmp2_w, tmp3_w);
+ LSX_BUTTERFLY_4_W(tmp0_w, tmp1_w, tmp2_w, tmp3_w, vec4_r, vec6_r, vec7_r,
+ vec5_r);
+ /* The _l halves are summed into vec0_r..vec3_r: from here on those names
+  * hold results derived from the _l inputs, not the original _r values. */
+ DUP4_ARG2(__lsx_vadd_w, vec0_l, vec7_l, vec1_l, vec6_l, vec2_l, vec5_l,
+ vec3_l, vec4_l, vec0_r, vec1_r, vec2_r, vec3_r);
+
+ /* Hand-written 4-input butterfly; the statement order matters because
+  * vec0_r, vec3_r and vec1_r are overwritten as it goes. */
+ tmp3_w = __lsx_vadd_w(vec0_r, vec3_r);
+ vec0_r = __lsx_vsub_w(vec0_r, vec3_r);
+ vec3_r = __lsx_vadd_w(vec1_r, vec2_r);
+ vec1_r = __lsx_vsub_w(vec1_r, vec2_r);
+
+ DOTP_CONST_PAIR_W(vec4_r, vec6_r, tmp3_w, vec3_r, cospi_16_64, cospi_16_64,
+ vec4_r, tmp3_w, vec6_r, vec3_r);
+ FDCT32_POSTPROC_NEG_W(vec4_r);
+ FDCT32_POSTPROC_NEG_W(tmp3_w);
+ FDCT32_POSTPROC_NEG_W(vec6_r);
+ FDCT32_POSTPROC_NEG_W(vec3_r);
+ /* Pack the 32-bit results back into 16-bit lanes. */
+ DUP2_ARG2(__lsx_vpickev_h, vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5);
+ __lsx_vst(vec5, out, 0);
+ __lsx_vst(vec4, out, 16);
+
+ DOTP_CONST_PAIR_W(vec5_r, vec7_r, vec0_r, vec1_r, cospi_24_64, cospi_8_64,
+ vec4_r, tmp3_w, vec6_r, vec3_r);
+ FDCT32_POSTPROC_NEG_W(vec4_r);
+ FDCT32_POSTPROC_NEG_W(tmp3_w);
+ FDCT32_POSTPROC_NEG_W(vec6_r);
+ FDCT32_POSTPROC_NEG_W(vec3_r);
+ DUP2_ARG2(__lsx_vpickev_h, vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5);
+ __lsx_vst(vec5, out, 32);
+ __lsx_vst(vec4, out, 48);
+
+ /* 16-bit path: reload the saved stage-2 sums. */
+ DUP4_ARG2(__lsx_vld, interm_ptr, 0, interm_ptr, 16, interm_ptr, 32,
+ interm_ptr, 48, vec0, vec1, vec2, vec3);
+ DUP4_ARG2(__lsx_vld, interm_ptr, 64, interm_ptr, 80, interm_ptr, 96,
+ interm_ptr, 112, vec4, vec5, vec6, vec7);
+ DUP4_ARG2(__lsx_vsub_h, vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4,
+ vec5, vec6, vec7);
+ DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
+ DUP2_ARG2(__lsx_vadd_h, vec4, vec5, vec7, vec6, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, in5, in4);
+ FDCT_POSTPROC_2V_NEG_H(in4, in5);
+ __lsx_vst(in4, out, 64);
+ __lsx_vst(in5, out, 112);
+
+ DUP2_ARG2(__lsx_vsub_h, vec4, vec5, vec7, vec6, vec4, vec7);
+ DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, in5, in4);
+ FDCT_POSTPROC_2V_NEG_H(in4, in5);
+ __lsx_vst(in4, out, 80);
+ __lsx_vst(in5, out, 96);
+
+ /* Reload the saved stage-2 differences for the last eight outputs. */
+ DUP4_ARG2(__lsx_vld, interm_ptr, 128, interm_ptr, 144, interm_ptr, 160,
+ interm_ptr, 176, in8, in9, in10, in11);
+ DUP4_ARG2(__lsx_vld, interm_ptr, 192, interm_ptr, 208, interm_ptr, 224,
+ interm_ptr, 240, in12, in13, in14, in15);
+ DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
+ DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
+ DUP4_ARG2(__lsx_vadd_h, in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0,
+ vec1, vec6, in2);
+ DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
+ DUP2_ARG2(__lsx_vadd_h, in0, in1, in2, in3, vec0, vec7);
+ DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, in5, in4);
+ FDCT_POSTPROC_2V_NEG_H(in4, in5);
+ __lsx_vst(in4, out, 128);
+ __lsx_vst(in5, out, 240);
+
+ DUP2_ARG2(__lsx_vsub_h, in0, in1, in2, in3, in0, in2);
+ DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, in5, in4);
+ FDCT_POSTPROC_2V_NEG_H(in4, in5);
+ __lsx_vst(in4, out, 144);
+ __lsx_vst(in5, out, 224);
+
+ DUP2_ARG2(__lsx_vsub_h, in9, vec2, in14, vec5, vec2, vec5);
+ /* tmp0_w is reused here as a plain 16-bit-lane temporary despite its
+  * _w suffix. */
+ tmp0_w = __lsx_vneg_h(vec2);
+ DOTP_CONST_PAIR(tmp0_w, vec5, cospi_24_64, cospi_8_64, in2, in1);
+ DUP4_ARG2(__lsx_vsub_h, in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0,
+ vec2, vec5);
+ DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, in5, in4);
+ FDCT_POSTPROC_2V_NEG_H(in4, in5);
+ __lsx_vst(in4, out, 160);
+ __lsx_vst(in5, out, 208);
+
+ DUP2_ARG2(__lsx_vadd_h, in3, in2, in0, in1, vec3, vec4);
+ /* Output order here is (in4, in5), unlike the calls above. */
+ DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, in4, in5);
+ FDCT_POSTPROC_2V_NEG_H(in4, in5);
+ __lsx_vst(in4, out, 192);
+ __lsx_vst(in5, out, 176);
+}
+
+/*
+ * Even half of the 32-point row forward DCT for one band of eight rows.
+ *
+ * temp: sixteen 8-lane int16 vectors, read at byte offsets 0..240.
+ * out:  sixteen result vectors, written at byte offsets 0..240 in the
+ *       permuted order fixed by the stores below.
+ *
+ * Every output pair goes through FDCT_POSTPROC_2V_NEG_H just before it is
+ * stored. All loads from temp are issued before the first store to out, so
+ * the caller may pass the same buffer for both.
+ */
+static void fdct8x32_1d_row_even(int16_t *temp, int16_t *out) {
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i in8, in9, in10, in11, in12, in13, in14, in15;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1;
+
+ /* fdct32 even */
+ /* stage 2 */
+ DUP4_ARG2(__lsx_vld, temp, 0, temp, 16, temp, 32, temp, 48, in0, in1, in2,
+ in3);
+ DUP4_ARG2(__lsx_vld, temp, 64, temp, 80, temp, 96, temp, 112, in4, in5, in6,
+ in7);
+ DUP4_ARG2(__lsx_vld, temp, 128, temp, 144, temp, 160, temp, 176, in8, in9,
+ in10, in11);
+ DUP4_ARG2(__lsx_vld, temp, 192, temp, 208, temp, 224, temp, 240, in12, in13,
+ in14, in15);
+
+ /* vec0..vec7 receive the first eight butterfly outputs, in8..in15 the rest. */
+ LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,
+ in11, in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4,
+ vec5, vec6, vec7, in8, in9, in10, in11, in12, in13, in14,
+ in15);
+ /* Stage 3 */
+ DUP4_ARG2(__lsx_vadd_h, vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0,
+ in1, in2, in3);
+ LSX_BUTTERFLY_4_H(in0, in1, in2, in3, temp0, in4, in1, in0);
+ DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0);
+ FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+ __lsx_vst(temp0, out, 0);
+ __lsx_vst(temp1, out, 16);
+
+ DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
+ FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+ __lsx_vst(temp0, out, 32);
+ __lsx_vst(temp1, out, 48);
+
+ DUP4_ARG2(__lsx_vsub_h, vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4,
+ vec5, vec6, vec7);
+ DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
+ DUP2_ARG2(__lsx_vadd_h, vec4, vec5, vec7, vec6, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
+ FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+ __lsx_vst(temp0, out, 64);
+ __lsx_vst(temp1, out, 112);
+
+ DUP2_ARG2(__lsx_vsub_h, vec4, vec5, vec7, vec6, vec4, vec7);
+ DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
+ FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+ __lsx_vst(temp0, out, 80);
+ __lsx_vst(temp1, out, 96);
+
+ /* Second half: works on the in8..in15 butterfly outputs. */
+ DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
+ DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
+ DUP4_ARG2(__lsx_vadd_h, in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0,
+ vec1, vec6, in2);
+ DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
+ DUP2_ARG2(__lsx_vadd_h, in0, in1, in2, in3, vec0, vec7);
+ DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
+ FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+ __lsx_vst(temp0, out, 128);
+ __lsx_vst(temp1, out, 240);
+
+ DUP2_ARG2(__lsx_vsub_h, in0, in1, in2, in3, in0, in2);
+ DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
+ FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+ __lsx_vst(temp0, out, 144);
+ __lsx_vst(temp1, out, 224);
+
+ DUP2_ARG2(__lsx_vsub_h, in9, vec2, in14, vec5, vec2, vec5);
+ temp0 = __lsx_vneg_h(vec2);
+ DOTP_CONST_PAIR(temp0, vec5, cospi_24_64, cospi_8_64, in2, in1);
+ DUP4_ARG2(__lsx_vsub_h, in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0,
+ vec2, vec5);
+ DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
+ FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+ __lsx_vst(temp0, out, 160);
+ __lsx_vst(temp1, out, 208);
+
+ DUP2_ARG2(__lsx_vadd_h, in3, in2, in0, in1, vec3, vec4);
+ DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
+ FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+ __lsx_vst(temp0, out, 192);
+ __lsx_vst(temp1, out, 176);
+}
+/*
+ * Odd half of the 32-point row forward DCT for one band of eight rows.
+ *
+ * temp:       sixteen 8-lane int16 input vectors, read at byte offsets 0..240.
+ * interm_ptr: scratch; eight difference vectors are parked at byte offsets
+ *             64..176 and reloaded for the second group of outputs.
+ * out:        sixteen result vectors, written at byte offsets 0..240.
+ *
+ * Every load from temp happens before the first store to out, so temp and out
+ * may be the same buffer (the callers in this file pass it that way).
+ * interm_ptr is written while temp is still being read and is reloaded after
+ * out has been partly written, so it must not overlap either of them.
+ */
+static void fdct8x32_1d_row_odd(int16_t *temp, int16_t *interm_ptr,
+ int16_t *out) {
+ __m128i in16, in17, in18, in19, in20, in21, in22, in23;
+ __m128i in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5;
+ __m128i tmp0, tmp1;
+
+ in20 = __lsx_vld(temp, 64);
+ in21 = __lsx_vld(temp, 80);
+ in26 = __lsx_vld(temp, 160);
+ in27 = __lsx_vld(temp, 176);
+
+ DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
+ DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);
+
+ in18 = __lsx_vld(temp, 32);
+ in19 = __lsx_vld(temp, 48);
+ in28 = __lsx_vld(temp, 192);
+ in29 = __lsx_vld(temp, 208);
+
+ vec4 = __lsx_vsub_h(in19, in20); /* differences are parked in interm_ptr */
+ __lsx_vst(vec4, interm_ptr, 64);
+ vec4 = __lsx_vsub_h(in18, in21);
+ __lsx_vst(vec4, interm_ptr, 176);
+ vec4 = __lsx_vsub_h(in28, in27);
+ __lsx_vst(vec4, interm_ptr, 112);
+ vec4 = __lsx_vsub_h(in29, in26);
+ __lsx_vst(vec4, interm_ptr, 128);
+
+ DUP4_ARG2(__lsx_vadd_h, in18, in21, in19, in20, in28, in27, in29, in26, in21,
+ in20, in27, in26);
+
+ in22 = __lsx_vld(temp, 96);
+ in23 = __lsx_vld(temp, 112);
+ in24 = __lsx_vld(temp, 128);
+ in25 = __lsx_vld(temp, 144);
+
+ DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
+ DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);
+
+ in16 = __lsx_vld(temp, 0);
+ in17 = __lsx_vld(temp, 16);
+ in30 = __lsx_vld(temp, 224);
+ in31 = __lsx_vld(temp, 240); /* last read of temp */
+
+ vec4 = __lsx_vsub_h(in17, in22);
+ __lsx_vst(vec4, interm_ptr, 80);
+ vec4 = __lsx_vsub_h(in30, in25);
+ __lsx_vst(vec4, interm_ptr, 96);
+ vec4 = __lsx_vsub_h(in31, in24);
+ __lsx_vst(vec4, interm_ptr, 144);
+ vec4 = __lsx_vsub_h(in16, in23);
+ __lsx_vst(vec4, interm_ptr, 160);
+
+ DUP4_ARG2(__lsx_vadd_h, in16, in23, in17, in22, in30, in25, in31, in24, in16,
+ in17, in30, in31);
+ DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
+ DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);
+
+ DUP4_ARG2(__lsx_vadd_h, in16, in19, in17, in18, in30, in29, in31, in28, in27,
+ in22, in21, in25);
+ DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
+ DUP2_ARG2(__lsx_vadd_h, in27, in26, in25, in24, in23, in20);
+
+ DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
+ FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+ __lsx_vst(vec5, out, 0); /* first write to out */
+ __lsx_vst(vec4, out, 240);
+
+ DUP2_ARG2(__lsx_vsub_h, in27, in26, in25, in24, in22, in21);
+
+ DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
+ FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+ __lsx_vst(vec5, out, 224);
+ __lsx_vst(vec4, out, 16);
+
+ DUP4_ARG2(__lsx_vsub_h, in17, in18, in16, in19, in31, in28, in30, in29, in23,
+ in26, in24, in20);
+ tmp0 = __lsx_vneg_h(in23);
+ DOTP_CONST_PAIR(tmp0, in20, cospi_28_64, cospi_4_64, in27, in25);
+ DUP2_ARG2(__lsx_vsub_h, in26, in27, in24, in25, in23, in20);
+
+ DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
+ FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+ __lsx_vst(vec4, out, 32);
+ __lsx_vst(vec5, out, 208);
+
+ DUP2_ARG2(__lsx_vadd_h, in26, in27, in24, in25, in22, in21);
+ DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
+ FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+ __lsx_vst(vec4, out, 48);
+ __lsx_vst(vec5, out, 192);
+
+ in20 = __lsx_vld(interm_ptr, 64); /* reload the parked differences */
+ in21 = __lsx_vld(interm_ptr, 176);
+ in27 = __lsx_vld(interm_ptr, 112);
+ in26 = __lsx_vld(interm_ptr, 128);
+
+ in16 = in20;
+ in17 = in21;
+ DUP2_ARG1(__lsx_vneg_h, in16, in17, tmp0, tmp1);
+ DOTP_CONST_PAIR(tmp0, in27, cospi_24_64, cospi_8_64, in20, in27);
+ DOTP_CONST_PAIR(tmp1, in26, cospi_24_64, cospi_8_64, in21, in26);
+
+ in22 = __lsx_vld(interm_ptr, 80);
+ in25 = __lsx_vld(interm_ptr, 96);
+ in24 = __lsx_vld(interm_ptr, 144);
+ in23 = __lsx_vld(interm_ptr, 160);
+
+ DUP4_ARG2(__lsx_vsub_h, in23, in20, in22, in21, in25, in26, in24, in27, in28,
+ in17, in18, in31);
+ DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
+ DUP2_ARG2(__lsx_vadd_h, in28, in29, in31, in30, in16, in19);
+ DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
+ FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+ __lsx_vst(vec5, out, 64);
+ __lsx_vst(vec4, out, 176);
+
+ DUP2_ARG2(__lsx_vsub_h, in28, in29, in31, in30, in17, in18);
+ DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
+ FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+ __lsx_vst(vec5, out, 80);
+ __lsx_vst(vec4, out, 160);
+
+ DUP4_ARG2(__lsx_vadd_h, in22, in21, in23, in20, in24, in27, in25, in26, in16,
+ in29, in30, in19);
+ tmp0 = __lsx_vneg_h(in16);
+ DOTP_CONST_PAIR(tmp0, in19, cospi_12_64, cospi_20_64, in28, in31);
+ DUP2_ARG2(__lsx_vsub_h, in29, in28, in30, in31, in16, in19);
+
+ DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
+ FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+ __lsx_vst(vec5, out, 144);
+ __lsx_vst(vec4, out, 96);
+
+ DUP2_ARG2(__lsx_vadd_h, in29, in28, in30, in31, in17, in18);
+
+ DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
+ FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+ __lsx_vst(vec4, out, 112);
+ __lsx_vst(vec5, out, 128);
+}
+/*
+ * Transposes one band of row-transform results and writes it to output.
+ *
+ * temp:   thirty-two 8-lane int16 vectors (byte offsets 0..496), read in four
+ *         groups of eight at the non-sequential offsets listed below.
+ * output: each group is transposed 8x8 and stored as eight vectors 64 bytes
+ *         apart; the four groups land at byte offsets 0, 16, 32 and 48 within
+ *         each 64-byte step.
+ *
+ * NOTE(review): the load order presumably undoes the permuted order in which
+ * the even/odd row stages store their results - confirm against those stages.
+ */
+static void fdct8x32_1d_row_transpose_store(int16_t *temp, int16_t *output) {
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1;
+
+ /* 1st set */
+ in0 = __lsx_vld(temp, 0);
+ in4 = __lsx_vld(temp, 64);
+ in2 = __lsx_vld(temp, 128);
+ in6 = __lsx_vld(temp, 192);
+ in1 = __lsx_vld(temp, 256);
+ in7 = __lsx_vld(temp, 304);
+ in3 = __lsx_vld(temp, 384);
+ in5 = __lsx_vld(temp, 432);
+
+ LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+
+ /* 2nd set (loaded before the 1st set is stored) */
+ in0_1 = __lsx_vld(temp, 32);
+ in1_1 = __lsx_vld(temp, 464);
+ in2_1 = __lsx_vld(temp, 160);
+ in3_1 = __lsx_vld(temp, 336);
+ in4_1 = __lsx_vld(temp, 96);
+ in5_1 = __lsx_vld(temp, 352);
+ in6_1 = __lsx_vld(temp, 224);
+ in7_1 = __lsx_vld(temp, 480);
+
+ __lsx_vst(in0, output, 0);
+ __lsx_vst(in1, output, 64);
+ __lsx_vst(in2, output, 128);
+ __lsx_vst(in3, output, 192);
+ __lsx_vst(in4, output, 256);
+ __lsx_vst(in5, output, 320);
+ __lsx_vst(in6, output, 384);
+ __lsx_vst(in7, output, 448);
+
+ LSX_TRANSPOSE8x8_H(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
+ in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1);
+
+ /* 3rd set (loaded before the 2nd set is stored) */
+ in0 = __lsx_vld(temp, 16);
+ in1 = __lsx_vld(temp, 272);
+ in2 = __lsx_vld(temp, 144);
+ in3 = __lsx_vld(temp, 400);
+ in4 = __lsx_vld(temp, 80);
+ in5 = __lsx_vld(temp, 416);
+ in6 = __lsx_vld(temp, 208);
+ in7 = __lsx_vld(temp, 288);
+
+ __lsx_vst(in0_1, output, 16);
+ __lsx_vst(in1_1, output, 80);
+ __lsx_vst(in2_1, output, 144);
+ __lsx_vst(in3_1, output, 208);
+ __lsx_vst(in4_1, output, 272);
+ __lsx_vst(in5_1, output, 336);
+ __lsx_vst(in6_1, output, 400);
+ __lsx_vst(in7_1, output, 464);
+
+ LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+
+ __lsx_vst(in0, output, 32);
+ __lsx_vst(in1, output, 96);
+ __lsx_vst(in2, output, 160);
+ __lsx_vst(in3, output, 224);
+ __lsx_vst(in4, output, 288);
+ __lsx_vst(in5, output, 352);
+ __lsx_vst(in6, output, 416);
+ __lsx_vst(in7, output, 480);
+
+ /* 4th set */
+ in0_1 = __lsx_vld(temp, 48);
+ in1_1 = __lsx_vld(temp, 448);
+ in2_1 = __lsx_vld(temp, 176);
+ in3_1 = __lsx_vld(temp, 320);
+ in4_1 = __lsx_vld(temp, 112);
+ in5_1 = __lsx_vld(temp, 368);
+ in6_1 = __lsx_vld(temp, 240);
+ in7_1 = __lsx_vld(temp, 496);
+
+ LSX_TRANSPOSE8x8_H(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
+ in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1);
+
+ __lsx_vst(in0_1, output, 48);
+ __lsx_vst(in1_1, output, 112);
+ __lsx_vst(in2_1, output, 176);
+ __lsx_vst(in3_1, output, 240);
+ __lsx_vst(in4_1, output, 304);
+ __lsx_vst(in5_1, output, 368);
+ __lsx_vst(in6_1, output, 432);
+ __lsx_vst(in7_1, output, 496);
+}
+/*
+ * Row transform for one band of the column-pass output.
+ *
+ * temp:     input band; it is also handed to the odd stage as its scratch
+ *           area, so its contents are overwritten.
+ * temp_buf: working buffer; the even stage runs in place on its start and the
+ *           odd stage in place on temp_buf + 128.
+ * output:   receives the transposed results.
+ */
+static void fdct32x8_1d_row(int16_t *temp, int16_t *temp_buf, int16_t *output) {
+ fdct8x32_1d_row_load_butterfly(temp, temp_buf);
+ fdct8x32_1d_row_even(temp_buf, temp_buf);
+ fdct8x32_1d_row_odd(temp_buf + 128, temp, temp_buf + 128);
+ fdct8x32_1d_row_transpose_store(temp_buf, output);
+}
+/*
+ * Row transform for the first band; same sequence as fdct32x8_1d_row except
+ * that the even half is computed by fdct8x32_1d_row_even_4x, which is also
+ * given tmp_buf_big as a second buffer.
+ *
+ * tmp_buf_big is passed to the odd stage as scratch, so its contents are
+ * overwritten. NOTE(review): fdct8x32_1d_row_even_4x is defined earlier in
+ * the file; how it uses tmp_buf_big should be confirmed there.
+ */
+static void fdct32x8_1d_row_4x(int16_t *tmp_buf_big, int16_t *tmp_buf,
+ int16_t *output) {
+ fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf);
+ fdct8x32_1d_row_even_4x(tmp_buf, tmp_buf_big, tmp_buf);
+ fdct8x32_1d_row_odd(tmp_buf + 128, tmp_buf_big, tmp_buf + 128);
+ fdct8x32_1d_row_transpose_store(tmp_buf, output);
+}
+
+/*
+ * 32x32 forward DCT entry point.
+ *
+ * input:      source block; four strips starting 8 elements apart are handed
+ *             to the column transform together with src_stride.
+ * output:     destination; written in four steps of 256 elements.
+ * src_stride: forwarded unchanged to fdct8x32_1d_column.
+ *
+ * The first 256-element band of the column-pass result goes through
+ * fdct32x8_1d_row_4x, the remaining three through fdct32x8_1d_row.
+ */
+void vpx_fdct32x32_lsx(const int16_t *input, int16_t *output,
+                       int32_t src_stride) {
+  DECLARE_ALIGNED(32, int16_t, col_pass[1024]);
+  DECLARE_ALIGNED(32, int16_t, scratch[256]);
+  int strip = 0;
+  int band = 1;
+
+  /* Column pass: one call per strip, results collected in col_pass. */
+  while (strip < 4) {
+    fdct8x32_1d_column(input + strip * 8, src_stride, scratch,
+                       col_pass + strip * 8);
+    ++strip;
+  }
+
+  /* Row pass, first band. */
+  fdct32x8_1d_row_4x(col_pass, scratch, output);
+
+  /* Row pass, bands 1..3. */
+  for (; band < 4; ++band) {
+    const int offset = band * 256;
+    fdct32x8_1d_row(col_pass + offset, scratch, output + offset);
+  }
+}
+/*
+ * Even half of the 32-point row forward DCT, variant used by the _rd path.
+ *
+ * temp: sixteen 8-lane int16 vectors, read at byte offsets 0..240.
+ * out:  sixteen result vectors, written at byte offsets 0..240.
+ *
+ * Unlike fdct8x32_1d_row_even, FDCT_POSTPROC_2V_NEG_H is applied once to all
+ * sixteen stage-2 butterfly outputs and not to the values being stored.
+ * All loads from temp precede the first store to out, so both may be the
+ * same buffer.
+ */
+static void fdct8x32_1d_row_even_rd(int16_t *temp, int16_t *out) {
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i in8, in9, in10, in11, in12, in13, in14, in15;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1;
+
+ /* fdct32 even */
+ /* stage 2 */
+ DUP4_ARG2(__lsx_vld, temp, 0, temp, 16, temp, 32, temp, 48, in0, in1, in2,
+ in3);
+ DUP4_ARG2(__lsx_vld, temp, 64, temp, 80, temp, 96, temp, 112, in4, in5, in6,
+ in7);
+ DUP4_ARG2(__lsx_vld, temp, 128, temp, 144, temp, 160, temp, 176, in8, in9,
+ in10, in11);
+ DUP4_ARG2(__lsx_vld, temp, 192, temp, 208, temp, 224, temp, 240, in12, in13,
+ in14, in15);
+ LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,
+ in11, in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4,
+ vec5, vec6, vec7, in8, in9, in10, in11, in12, in13, in14,
+ in15);
+
+ FDCT_POSTPROC_2V_NEG_H(vec0, vec1); /* post-process every butterfly output */
+ FDCT_POSTPROC_2V_NEG_H(vec2, vec3);
+ FDCT_POSTPROC_2V_NEG_H(vec4, vec5);
+ FDCT_POSTPROC_2V_NEG_H(vec6, vec7);
+ FDCT_POSTPROC_2V_NEG_H(in8, in9);
+ FDCT_POSTPROC_2V_NEG_H(in10, in11);
+ FDCT_POSTPROC_2V_NEG_H(in12, in13);
+ FDCT_POSTPROC_2V_NEG_H(in14, in15);
+
+ /* Stage 3 */
+ DUP4_ARG2(__lsx_vadd_h, vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0,
+ in1, in2, in3);
+
+ temp0 = __lsx_vadd_h(in0, in3); /* 4-point butterfly written out by hand */
+ in0 = __lsx_vsub_h(in0, in3);
+ in3 = __lsx_vadd_h(in1, in2);
+ in1 = __lsx_vsub_h(in1, in2);
+
+ DOTP_CONST_PAIR(temp0, in3, cospi_16_64, cospi_16_64, temp1, temp0);
+ __lsx_vst(temp0, out, 0);
+ __lsx_vst(temp1, out, 16);
+
+ DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
+ __lsx_vst(temp0, out, 32);
+ __lsx_vst(temp1, out, 48);
+
+ DUP4_ARG2(__lsx_vsub_h, vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4,
+ vec5, vec6, vec7);
+ DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
+ DUP2_ARG2(__lsx_vadd_h, vec4, vec5, vec7, vec6, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
+ __lsx_vst(temp0, out, 64);
+ __lsx_vst(temp1, out, 112);
+
+ DUP2_ARG2(__lsx_vsub_h, vec4, vec5, vec7, vec6, vec4, vec7);
+ DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
+ __lsx_vst(temp0, out, 80);
+ __lsx_vst(temp1, out, 96);
+
+ DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
+ DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
+ DUP4_ARG2(__lsx_vadd_h, in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0,
+ vec1, vec6, in2);
+ DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
+ DUP2_ARG2(__lsx_vadd_h, in0, in1, in2, in3, vec0, vec7);
+ DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
+ __lsx_vst(temp0, out, 128);
+ __lsx_vst(temp1, out, 240);
+
+ DUP2_ARG2(__lsx_vsub_h, in0, in1, in2, in3, in0, in2);
+ DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
+ __lsx_vst(temp0, out, 144);
+ __lsx_vst(temp1, out, 224);
+
+ DUP2_ARG2(__lsx_vsub_h, in9, vec2, in14, vec5, vec2, vec5);
+ temp0 = __lsx_vneg_h(vec2);
+ DOTP_CONST_PAIR(temp0, vec5, cospi_24_64, cospi_8_64, in2, in1);
+ DUP4_ARG2(__lsx_vsub_h, in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0,
+ vec2, vec5);
+ DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
+ __lsx_vst(temp0, out, 160);
+ __lsx_vst(temp1, out, 208);
+
+ DUP2_ARG2(__lsx_vadd_h, in3, in2, in0, in1, vec3, vec4);
+ DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
+ __lsx_vst(temp0, out, 192);
+ __lsx_vst(temp1, out, 176);
+}
+/*
+ * Odd half of the 32-point row forward DCT, variant used by the _rd path.
+ *
+ * temp:       sixteen 8-lane int16 input vectors, read at byte offsets 0..240.
+ * interm_ptr: scratch; eight difference vectors are parked at byte offsets
+ *             64..176 and reloaded for the second group of outputs.
+ * out:        sixteen result vectors, written at byte offsets 0..240.
+ *
+ * Unlike fdct8x32_1d_row_odd, FDCT_POSTPROC_2V_NEG_H is applied to the values
+ * right after they are loaded (or after the first cospi_16_64 rotation) and
+ * not to the values being stored.
+ * Every load from temp precedes the first store to out, so temp and out may
+ * be the same buffer; interm_ptr must not overlap either of them.
+ */
+static void fdct8x32_1d_row_odd_rd(int16_t *temp, int16_t *interm_ptr,
+ int16_t *out) {
+ __m128i in16, in17, in18, in19, in20, in21, in22, in23;
+ __m128i in24, in25, in26, in27, in28, in29, in30, in31;
+ __m128i vec4, vec5, tmp0, tmp1;
+
+ in20 = __lsx_vld(temp, 64);
+ in21 = __lsx_vld(temp, 80);
+ in26 = __lsx_vld(temp, 160);
+ in27 = __lsx_vld(temp, 176);
+
+ DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
+ DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);
+
+ FDCT_POSTPROC_2V_NEG_H(in20, in21);
+ FDCT_POSTPROC_2V_NEG_H(in26, in27);
+
+ in18 = __lsx_vld(temp, 32);
+ in19 = __lsx_vld(temp, 48);
+ in28 = __lsx_vld(temp, 192);
+ in29 = __lsx_vld(temp, 208);
+
+ FDCT_POSTPROC_2V_NEG_H(in18, in19);
+ FDCT_POSTPROC_2V_NEG_H(in28, in29);
+
+ vec4 = __lsx_vsub_h(in19, in20); /* differences are parked in interm_ptr */
+ __lsx_vst(vec4, interm_ptr, 64);
+ vec4 = __lsx_vsub_h(in18, in21);
+ __lsx_vst(vec4, interm_ptr, 176);
+ vec4 = __lsx_vsub_h(in29, in26);
+ __lsx_vst(vec4, interm_ptr, 128);
+ vec4 = __lsx_vsub_h(in28, in27);
+ __lsx_vst(vec4, interm_ptr, 112);
+
+ DUP4_ARG2(__lsx_vadd_h, in18, in21, in19, in20, in28, in27, in29, in26, in21,
+ in20, in27, in26);
+
+ in22 = __lsx_vld(temp, 96);
+ in23 = __lsx_vld(temp, 112);
+ in24 = __lsx_vld(temp, 128);
+ in25 = __lsx_vld(temp, 144);
+
+ DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
+ DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);
+ FDCT_POSTPROC_2V_NEG_H(in22, in23);
+ FDCT_POSTPROC_2V_NEG_H(in24, in25);
+
+ in16 = __lsx_vld(temp, 0);
+ in17 = __lsx_vld(temp, 16);
+ in30 = __lsx_vld(temp, 224);
+ in31 = __lsx_vld(temp, 240); /* last read of temp */
+
+ FDCT_POSTPROC_2V_NEG_H(in16, in17);
+ FDCT_POSTPROC_2V_NEG_H(in30, in31);
+
+ vec4 = __lsx_vsub_h(in17, in22);
+ __lsx_vst(vec4, interm_ptr, 80);
+ vec4 = __lsx_vsub_h(in30, in25);
+ __lsx_vst(vec4, interm_ptr, 96);
+ vec4 = __lsx_vsub_h(in31, in24);
+ __lsx_vst(vec4, interm_ptr, 144);
+ vec4 = __lsx_vsub_h(in16, in23);
+ __lsx_vst(vec4, interm_ptr, 160);
+
+ DUP4_ARG2(__lsx_vadd_h, in16, in23, in17, in22, in30, in25, in31, in24, in16,
+ in17, in30, in31);
+ DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
+ DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);
+ DUP4_ARG2(__lsx_vadd_h, in16, in19, in17, in18, in30, in29, in31, in28, in27,
+ in22, in21, in25);
+ DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
+ DUP2_ARG2(__lsx_vadd_h, in27, in26, in25, in24, in23, in20);
+ DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
+ __lsx_vst(vec5, out, 0); /* first write to out */
+ __lsx_vst(vec4, out, 240);
+
+ DUP2_ARG2(__lsx_vsub_h, in27, in26, in25, in24, in22, in21);
+ DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
+ __lsx_vst(vec5, out, 224);
+ __lsx_vst(vec4, out, 16);
+
+ DUP4_ARG2(__lsx_vsub_h, in17, in18, in16, in19, in31, in28, in30, in29, in23,
+ in26, in24, in20);
+ tmp0 = __lsx_vneg_h(in23);
+ DOTP_CONST_PAIR(tmp0, in20, cospi_28_64, cospi_4_64, in27, in25);
+ DUP2_ARG2(__lsx_vsub_h, in26, in27, in24, in25, in23, in20);
+ DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
+ __lsx_vst(vec4, out, 32);
+ __lsx_vst(vec5, out, 208);
+
+ DUP2_ARG2(__lsx_vadd_h, in26, in27, in24, in25, in22, in21);
+ DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
+ __lsx_vst(vec4, out, 48);
+ __lsx_vst(vec5, out, 192);
+
+ in20 = __lsx_vld(interm_ptr, 64); /* reload the parked differences */
+ in21 = __lsx_vld(interm_ptr, 176);
+ in27 = __lsx_vld(interm_ptr, 112);
+ in26 = __lsx_vld(interm_ptr, 128);
+
+ in16 = in20;
+ in17 = in21;
+ DUP2_ARG1(__lsx_vneg_h, in16, in17, tmp0, tmp1);
+ DOTP_CONST_PAIR(tmp0, in27, cospi_24_64, cospi_8_64, in20, in27);
+ DOTP_CONST_PAIR(tmp1, in26, cospi_24_64, cospi_8_64, in21, in26);
+
+ in22 = __lsx_vld(interm_ptr, 80);
+ in25 = __lsx_vld(interm_ptr, 96);
+ in24 = __lsx_vld(interm_ptr, 144);
+ in23 = __lsx_vld(interm_ptr, 160);
+
+ DUP4_ARG2(__lsx_vsub_h, in23, in20, in22, in21, in25, in26, in24, in27, in28,
+ in17, in18, in31);
+ DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
+ in16 = __lsx_vadd_h(in28, in29);
+ in19 = __lsx_vadd_h(in31, in30);
+ DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
+ __lsx_vst(vec5, out, 64);
+ __lsx_vst(vec4, out, 176);
+
+ DUP2_ARG2(__lsx_vsub_h, in28, in29, in31, in30, in17, in18);
+ DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
+ __lsx_vst(vec5, out, 80);
+ __lsx_vst(vec4, out, 160);
+
+ DUP4_ARG2(__lsx_vadd_h, in22, in21, in23, in20, in24, in27, in25, in26, in16,
+ in29, in30, in19);
+ tmp0 = __lsx_vneg_h(in16);
+ DOTP_CONST_PAIR(tmp0, in19, cospi_12_64, cospi_20_64, in28, in31);
+ DUP2_ARG2(__lsx_vsub_h, in29, in28, in30, in31, in16, in19);
+ DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
+ __lsx_vst(vec5, out, 144);
+ __lsx_vst(vec4, out, 96);
+
+ DUP2_ARG2(__lsx_vadd_h, in29, in28, in30, in31, in17, in18);
+ DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
+ __lsx_vst(vec4, out, 112);
+ __lsx_vst(vec5, out, 128);
+}
+/*
+ * Row transform for one band on the _rd path: same call sequence as
+ * fdct32x8_1d_row, using the _rd variants of the even and odd stages.
+ *
+ * tmp_buf_big: input band; also handed to the odd stage as scratch, so its
+ *              contents are overwritten.
+ * tmp_buf:     working buffer (even half at its start, odd half at + 128).
+ * output:      receives the transposed results.
+ */
+static void fdct32x8_1d_row_rd(int16_t *tmp_buf_big, int16_t *tmp_buf,
+ int16_t *output) {
+ fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf);
+ fdct8x32_1d_row_even_rd(tmp_buf, tmp_buf);
+ fdct8x32_1d_row_odd_rd((tmp_buf + 128), tmp_buf_big, (tmp_buf + 128));
+ fdct8x32_1d_row_transpose_store(tmp_buf, output);
+}
+
+/*
+ * 32x32 forward DCT entry point, _rd variant.
+ *
+ * input:      source block; four strips starting 8 elements apart are handed
+ *             to the column transform together with src_stride.
+ * out:        destination; written in four steps of 256 elements.
+ * src_stride: forwarded unchanged to fdct8x32_1d_column.
+ *
+ * All four 256-element bands of the column-pass result go through
+ * fdct32x8_1d_row_rd.
+ */
+void vpx_fdct32x32_rd_lsx(const int16_t *input, int16_t *out,
+                          int32_t src_stride) {
+  DECLARE_ALIGNED(32, int16_t, col_pass[1024]);
+  DECLARE_ALIGNED(32, int16_t, scratch[256]);
+  int32_t strip = 0;
+  int32_t band = 0;
+
+  /* Column pass: one call per strip, results collected in col_pass. */
+  while (strip < 4) {
+    fdct8x32_1d_column(input + 8 * strip, src_stride, scratch,
+                       col_pass + 8 * strip);
+    ++strip;
+  }
+
+  /* Row pass: one call per band of 8 * 32 elements. */
+  while (band < 4) {
+    const int32_t offset = 8 * band * 32;
+    fdct32x8_1d_row_rd(col_pass + offset, scratch, out + offset);
+    ++band;
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.c
new file mode 100644
index 0000000000..508532b9d8
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.c
@@ -0,0 +1,350 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/fwd_txfm_lsx.h"
+/*
+ * Transposes 16-bit elements across four vectors using two rounds of halfword
+ * interleaves followed by 64-bit even/odd picks.
+ *
+ * _in0.._in3:   input vectors (read only).
+ * _out0.._out3: output vectors; they may name the same variables as the
+ *               inputs because every input is consumed into the _s*
+ *               temporaries before any output is assigned.
+ *
+ * NOTE(review): judging from the interleave/pick sequence, lanes 0-3 and
+ * lanes 4-7 are transposed as two independent 4x4 blocks, with _outK holding
+ * column K of the first block in its low 64 bits and column K of the second
+ * block in its high 64 bits - confirm against the LSX instruction reference.
+ */
+#define LSX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+ do { \
+ __m128i _s0, _s1, _s2, _s3, _t0, _t1, _t2, _t3; \
+ \
+ DUP2_ARG2(__lsx_vilvl_h, _in2, _in0, _in3, _in1, _s0, _s1); \
+ DUP2_ARG2(__lsx_vilvh_h, _in2, _in0, _in3, _in1, _s2, _s3); \
+ _t0 = __lsx_vilvl_h(_s1, _s0); \
+ _t1 = __lsx_vilvh_h(_s1, _s0); \
+ _t2 = __lsx_vilvl_h(_s3, _s2); \
+ _t3 = __lsx_vilvh_h(_s3, _s2); \
+ DUP2_ARG2(__lsx_vpickev_d, _t2, _t0, _t3, _t1, _out0, _out2); \
+ DUP2_ARG2(__lsx_vpickod_d, _t2, _t0, _t3, _t1, _out1, _out3); \
+ } while (0)
+
+#if !CONFIG_VP9_HIGHBITDEPTH
+/*
+ * Column pass of the 16x16 forward DCT for a strip of eight columns.
+ *
+ * input:      source block; sixteen rows of eight int16 values are loaded,
+ *             src_stride elements apart. The data is only read.
+ * tmp_ptr:    destination. The eight vectors produced by FDCT8x16_EVEN are
+ *             stored at byte offsets 0, 64, ..., 448; the eight odd-half
+ *             vectors are stored at the same offsets relative to
+ *             tmp_ptr + 16 elements, in the order fixed by the stores below.
+ * src_stride: row pitch of input in int16 elements.
+ *
+ * Every input row is scaled by 4 (left shift by 2) before the transform.
+ */
+void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr,
+ int32_t src_stride) {
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i in8, in9, in10, in11, in12, in13, in14, in15;
+ __m128i stp21, stp22, stp23, stp24, stp25, stp26, stp30;
+ __m128i stp31, stp32, stp33, stp34, stp35, stp36, stp37;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, cnst0, cnst1, cnst4, cnst5;
+ /* Packed 16-bit constants; individual lanes are broadcast with
+ * __lsx_vreplvei_h and paired with __lsx_vpackev_h below. */
+ __m128i coeff = { 0x187e3b21d2bf2d41, 0x238e3537e782c4df };
+ __m128i coeff1 = { 0x289a317906463fb1, 0x12943d3f1e2b3871 };
+ __m128i coeff2 = { 0xed6cd766c78fc04f, 0x0 };
+
+ /* __lsx_vldx takes byte offsets: src_strideN is N/2 rows in bytes. The same
+ * values double as element counts when added to the int16_t pointer. */
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride4 = src_stride2 << 1;
+ int32_t src_stride6 = src_stride4 + src_stride2;
+ int32_t src_stride8 = src_stride4 << 1;
+ /* The loads only read, so the pointer keeps its const qualifier. */
+ const int16_t *input_tmp = input;
+ in0 = __lsx_vld(input_tmp, 0);
+ DUP4_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4,
+ input_tmp, src_stride6, input_tmp, src_stride8, in1, in2, in3, in4);
+ input_tmp += src_stride4;
+ DUP4_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4,
+ input_tmp, src_stride6, input_tmp, src_stride8, in5, in6, in7, in8);
+ input_tmp += src_stride4;
+ DUP4_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4,
+ input_tmp, src_stride6, input_tmp, src_stride8, in9, in10, in11,
+ in12);
+ input_tmp += src_stride4;
+ DUP2_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, in13,
+ in14);
+ input_tmp += src_stride2;
+ in15 = __lsx_vldx(input_tmp, src_stride2);
+
+ DUP4_ARG2(__lsx_vslli_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3);
+ DUP4_ARG2(__lsx_vslli_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7);
+ DUP4_ARG2(__lsx_vslli_h, in8, 2, in9, 2, in10, 2, in11, 2, in8, in9, in10,
+ in11);
+ DUP4_ARG2(__lsx_vslli_h, in12, 2, in13, 2, in14, 2, in15, 2, in12, in13, in14,
+ in15);
+ /* Even half: sums of mirrored rows (0+15, 1+14, ...). */
+ DUP4_ARG2(__lsx_vadd_h, in0, in15, in1, in14, in2, in13, in3, in12, tmp0,
+ tmp1, tmp2, tmp3);
+ DUP4_ARG2(__lsx_vadd_h, in4, in11, in5, in10, in6, in9, in7, in8, tmp4, tmp5,
+ tmp6, tmp7);
+ FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1,
+ tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
+ __lsx_vst(tmp0, tmp_ptr, 0);
+ __lsx_vst(tmp1, tmp_ptr, 64);
+ __lsx_vst(tmp2, tmp_ptr, 128);
+ __lsx_vst(tmp3, tmp_ptr, 192);
+ __lsx_vst(tmp4, tmp_ptr, 256);
+ __lsx_vst(tmp5, tmp_ptr, 320);
+ __lsx_vst(tmp6, tmp_ptr, 384);
+ __lsx_vst(tmp7, tmp_ptr, 448);
+ /* Odd half: differences of the same mirrored rows. */
+ DUP4_ARG2(__lsx_vsub_h, in0, in15, in1, in14, in2, in13, in3, in12, in15,
+ in14, in13, in12);
+ DUP4_ARG2(__lsx_vsub_h, in4, in11, in5, in10, in6, in9, in7, in8, in11, in10,
+ in9, in8);
+
+ /* Odd outputs go to the slots interleaved with the even ones. */
+ tmp_ptr += 16;
+
+ /* stp 1 */
+ DUP2_ARG2(__lsx_vilvh_h, in10, in13, in11, in12, vec2, vec4);
+ DUP2_ARG2(__lsx_vilvl_h, in10, in13, in11, in12, vec3, vec5);
+
+ cnst4 = __lsx_vreplvei_h(coeff, 0);
+ DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst4, stp25);
+
+ cnst5 = __lsx_vreplvei_h(coeff, 1);
+ cnst5 = __lsx_vpackev_h(cnst5, cnst4);
+ DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst5, stp22);
+ DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst4, stp24);
+ DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst5, stp23);
+
+ /* stp2 */
+ LSX_BUTTERFLY_4_H(in8, in9, stp22, stp23, stp30, stp31, stp32, stp33);
+ LSX_BUTTERFLY_4_H(in15, in14, stp25, stp24, stp37, stp36, stp35, stp34);
+ DUP2_ARG2(__lsx_vilvh_h, stp36, stp31, stp35, stp32, vec2, vec4);
+ DUP2_ARG2(__lsx_vilvl_h, stp36, stp31, stp35, stp32, vec3, vec5);
+ DUP2_ARG2(__lsx_vreplvei_h, coeff, 2, coeff, 3, cnst0, cnst1);
+ cnst0 = __lsx_vpackev_h(cnst0, cnst1);
+ DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst0, stp26);
+
+ cnst0 = __lsx_vreplvei_h(coeff, 4);
+ cnst1 = __lsx_vpackev_h(cnst1, cnst0);
+ DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst1, stp21);
+
+ LSX_BUTTERFLY_4_H(stp30, stp37, stp26, stp21, in8, in15, in14, in9);
+ vec1 = __lsx_vilvl_h(in15, in8);
+ vec0 = __lsx_vilvh_h(in15, in8);
+
+ DUP2_ARG2(__lsx_vreplvei_h, coeff1, 0, coeff1, 1, cnst0, cnst1);
+ cnst0 = __lsx_vpackev_h(cnst0, cnst1);
+
+ DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8);
+ __lsx_vst(in8, tmp_ptr, 0);
+
+ cnst0 = __lsx_vreplvei_h(coeff2, 0);
+ cnst0 = __lsx_vpackev_h(cnst1, cnst0);
+ DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8);
+ __lsx_vst(in8, tmp_ptr, 448);
+
+ vec1 = __lsx_vilvl_h(in14, in9);
+ vec0 = __lsx_vilvh_h(in14, in9);
+ DUP2_ARG2(__lsx_vreplvei_h, coeff1, 2, coeff1, 3, cnst0, cnst1);
+ cnst1 = __lsx_vpackev_h(cnst1, cnst0);
+
+ DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1, in8);
+ __lsx_vst(in8, tmp_ptr, 256);
+
+ cnst1 = __lsx_vreplvei_h(coeff2, 2);
+ cnst0 = __lsx_vpackev_h(cnst0, cnst1);
+ DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8);
+ __lsx_vst(in8, tmp_ptr, 192);
+
+ DUP2_ARG2(__lsx_vreplvei_h, coeff, 2, coeff, 5, cnst0, cnst1);
+ cnst1 = __lsx_vpackev_h(cnst1, cnst0);
+ DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1, stp25);
+
+ cnst1 = __lsx_vreplvei_h(coeff, 3);
+ cnst1 = __lsx_vpackev_h(cnst0, cnst1);
+ DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1, stp22);
+
+ /* stp4 */
+ DUP2_ARG2(__lsx_vadd_h, stp34, stp25, stp33, stp22, in13, in10);
+
+ vec1 = __lsx_vilvl_h(in13, in10);
+ vec0 = __lsx_vilvh_h(in13, in10);
+ DUP2_ARG2(__lsx_vreplvei_h, coeff1, 4, coeff1, 5, cnst0, cnst1);
+ cnst0 = __lsx_vpackev_h(cnst0, cnst1);
+ DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8);
+ __lsx_vst(in8, tmp_ptr, 128);
+
+ cnst0 = __lsx_vreplvei_h(coeff2, 1);
+ cnst0 = __lsx_vpackev_h(cnst1, cnst0);
+ DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8);
+ __lsx_vst(in8, tmp_ptr, 320);
+
+ DUP2_ARG2(__lsx_vsub_h, stp34, stp25, stp33, stp22, in12, in11);
+ vec1 = __lsx_vilvl_h(in12, in11);
+ vec0 = __lsx_vilvh_h(in12, in11);
+ DUP2_ARG2(__lsx_vreplvei_h, coeff1, 6, coeff1, 7, cnst0, cnst1);
+ cnst1 = __lsx_vpackev_h(cnst1, cnst0);
+
+ DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1, in8);
+ __lsx_vst(in8, tmp_ptr, 384);
+
+ cnst1 = __lsx_vreplvei_h(coeff2, 3);
+ cnst0 = __lsx_vpackev_h(cnst0, cnst1);
+ DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8);
+ __lsx_vst(in8, tmp_ptr, 64);
+}
+/*
+ * Row pass of the 16x16 forward DCT for one band of eight rows.
+ *
+ * input:  sixteen 8-lane int16 vectors at byte offsets 0..240. The buffer is
+ *         also used as scratch: the second half of the 16-point butterfly is
+ *         written back to byte offsets 0, 32, ..., 224 and reloaded later, so
+ *         the caller's data is overwritten.
+ * output: sixteen result vectors; the first transposed group is stored at
+ *         byte offsets 0, 32, ..., 224 and the second at 16, 48, ..., 240.
+ *
+ * Each value is replaced by (x + 1) >> 2 after the initial transposes and
+ * before the butterfly.
+ */
+void fdct16x8_1d_row(int16_t *input, int16_t *output) {
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i in8, in9, in10, in11, in12, in13, in14, in15;
+ int16_t *input_tmp = input; /* plain alias of input */
+
+ DUP4_ARG2(__lsx_vld, input, 0, input, 32, input, 64, input, 96, in0, in1, in2,
+ in3);
+ DUP4_ARG2(__lsx_vld, input, 128, input, 160, input, 192, input, 224, in4, in5,
+ in6, in7);
+ DUP4_ARG2(__lsx_vld, input_tmp, 16, input_tmp, 48, input_tmp, 80, input_tmp,
+ 112, in8, in9, in10, in11);
+ DUP4_ARG2(__lsx_vld, input_tmp, 144, input_tmp, 176, input_tmp, 208,
+ input_tmp, 240, in12, in13, in14, in15);
+
+ LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ LSX_TRANSPOSE8x8_H(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
+ in10, in11, in12, in13, in14, in15);
+ DUP4_ARG2(__lsx_vaddi_hu, in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3);
+ DUP4_ARG2(__lsx_vaddi_hu, in4, 1, in5, 1, in6, 1, in7, 1, in4, in5, in6, in7);
+ DUP4_ARG2(__lsx_vaddi_hu, in8, 1, in9, 1, in10, 1, in11, 1, in8, in9, in10,
+ in11);
+ DUP4_ARG2(__lsx_vaddi_hu, in12, 1, in13, 1, in14, 1, in15, 1, in12, in13,
+ in14, in15);
+
+ DUP4_ARG2(__lsx_vsrai_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3);
+ DUP4_ARG2(__lsx_vsrai_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7);
+ DUP4_ARG2(__lsx_vsrai_h, in8, 2, in9, 2, in10, 2, in11, 2, in8, in9, in10,
+ in11);
+ DUP4_ARG2(__lsx_vsrai_h, in12, 2, in13, 2, in14, 2, in15, 2, in12, in13, in14,
+ in15);
+ LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,
+ in11, in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4,
+ tmp5, tmp6, tmp7, in8, in9, in10, in11, in12, in13, in14,
+ in15);
+ __lsx_vst(in8, input, 0); /* park the odd-half inputs in the input buffer */
+ __lsx_vst(in9, input, 32);
+ __lsx_vst(in10, input, 64);
+ __lsx_vst(in11, input, 96);
+ __lsx_vst(in12, input, 128);
+ __lsx_vst(in13, input, 160);
+ __lsx_vst(in14, input, 192);
+ __lsx_vst(in15, input, 224);
+
+ FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1,
+ tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
+ DUP4_ARG2(__lsx_vld, input, 0, input, 32, input, 64, input, 96, in8, in9,
+ in10, in11);
+ DUP4_ARG2(__lsx_vld, input, 128, input, 160, input, 192, input, 224, in12,
+ in13, in14, in15);
+ FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ /* Even (tmpN) and odd (inN) results are interleaved before transposing. */
+ LSX_TRANSPOSE8x8_H(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, tmp0, in0,
+ tmp1, in1, tmp2, in2, tmp3, in3);
+ __lsx_vst(tmp0, output, 0);
+ __lsx_vst(in0, output, 32);
+ __lsx_vst(tmp1, output, 64);
+ __lsx_vst(in1, output, 96);
+ __lsx_vst(tmp2, output, 128);
+ __lsx_vst(in2, output, 160);
+ __lsx_vst(tmp3, output, 192);
+ __lsx_vst(in3, output, 224);
+
+ LSX_TRANSPOSE8x8_H(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, tmp4, in4,
+ tmp5, in5, tmp6, in6, tmp7, in7);
+ __lsx_vst(tmp4, output, 16);
+ __lsx_vst(in4, output, 48);
+ __lsx_vst(tmp5, output, 80);
+ __lsx_vst(in5, output, 112);
+ __lsx_vst(tmp6, output, 144);
+ __lsx_vst(in6, output, 176);
+ __lsx_vst(tmp7, output, 208);
+ __lsx_vst(in7, output, 240);
+}
+/*
+ * 4x4 forward DCT of int16_t samples using LSX intrinsics.
+ *
+ * input      - first sample of the source block.
+ * output     - receives 16 coefficients, written as two 16-byte stores.
+ * src_stride - row pitch of input in int16_t elements.
+ *
+ * Every row is fetched with a full 128-bit load, i.e. eight int16_t are read
+ * per row.
+ */
+void vpx_fdct4x4_lsx(const int16_t *input, int16_t *output,
+ int32_t src_stride) {
+ __m128i in0, in1, in2, in3;
+ /* Byte offsets of rows 1, 2 and 3 (each int16_t element is two bytes). */
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride4 = src_stride2 << 1;
+ int32_t src_stride6 = src_stride4 + src_stride2;
+
+ in0 = __lsx_vld(input, 0);
+ DUP2_ARG2(__lsx_vldx, input, src_stride2, input, src_stride4, in1, in2);
+ in3 = __lsx_vldx(input, src_stride6);
+
+ /* fdct4 pre-process: scale every sample by 16, then add 1 to halfword lane 0
+ * of the first row when that scaled sample is non-zero. */
+ {
+ __m128i vec, mask;
+ __m128i zero = __lsx_vldi(0);
+
+ mask = __lsx_vinsgr2vr_b(zero, 1, 0); /* byte 0 = 1, remaining bytes 0 */
+ DUP4_ARG2(__lsx_vslli_h, in0, 4, in1, 4, in2, 4, in3, 4, in0, in1, in2,
+ in3);
+ vec = __lsx_vseqi_h(in0, 0);
+ vec = __lsx_vxori_b(vec, 255); /* invert: all-ones where in0 is non-zero */
+ vec = __lsx_vand_v(mask, vec);
+ in0 = __lsx_vadd_h(in0, vec);
+ }
+ /* Two VP9_FDCT4 passes, each followed by a 4x4 transpose; afterwards every
+ * coefficient becomes (x + 1) >> 2 and the low 64 bits of each row pair are
+ * packed into one vector for the store. */
+ VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
+ LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
+ VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
+ LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
+ DUP4_ARG2(__lsx_vaddi_hu, in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3);
+ DUP4_ARG2(__lsx_vsrai_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3);
+ DUP2_ARG2(__lsx_vpickev_d, in1, in0, in3, in2, in0, in2);
+ __lsx_vst(in0, output, 0);
+ __lsx_vst(in2, output, 16);
+}
+/*
+ * 8x8 forward DCT of int16_t samples using LSX intrinsics.
+ *
+ * input      - first sample of the source block.
+ * output     - receives 64 coefficients as eight consecutive 16-byte stores.
+ * src_stride - row pitch of input in int16_t elements.
+ */
+void vpx_fdct8x8_lsx(const int16_t *input, int16_t *output,
+ int32_t src_stride) {
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride4 = src_stride2 << 1;
+ int32_t src_stride6 = src_stride4 + src_stride2;
+ int16_t *input_tmp = (int16_t *)input;
+ /* Rows 0-3 are loaded with byte offsets src_stride2/4/6 (one, two and three
+ * rows). src_stride4 is then reused as an element count: adding it to the
+ * int16_t pointer advances four rows, after which rows 4-7 are loaded the
+ * same way. */
+ in0 = __lsx_vld(input_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, in1,
+ in2);
+ in3 = __lsx_vldx(input_tmp, src_stride6);
+ input_tmp += src_stride4;
+ in4 = __lsx_vld(input_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, in5,
+ in6);
+ in7 = __lsx_vldx(input_tmp, src_stride6);
+ /* Pre-scale every sample by 4. */
+ DUP4_ARG2(__lsx_vslli_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3);
+ DUP4_ARG2(__lsx_vslli_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7);
+ /* Two VP9_FDCT8 passes, each followed by an 8x8 transpose, then the
+ * SRLI_AVE_S_4V_H post-scale applied to all eight rows. */
+ VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
+ in5, in6, in7);
+ LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
+ in5, in6, in7);
+ LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7);
+ /* Store the eight result rows back to back (offsets are in bytes). */
+ __lsx_vst(in0, output, 0);
+ __lsx_vst(in1, output, 16);
+ __lsx_vst(in2, output, 32);
+ __lsx_vst(in3, output, 48);
+ __lsx_vst(in4, output, 64);
+ __lsx_vst(in5, output, 80);
+ __lsx_vst(in6, output, 96);
+ __lsx_vst(in7, output, 112);
+}
+
+/*
+ * 16x16 forward DCT driver: two column-pass calls fill an intermediate
+ * buffer, then two row-pass calls produce the final coefficients.
+ *
+ * input      - source samples.
+ * output     - receives the 16 * 16 coefficients.
+ * src_stride - forwarded unchanged to fdct8x16_1d_column.
+ */
+void vpx_fdct16x16_lsx(const int16_t *input, int16_t *output,
+ int32_t src_stride) {
+ DECLARE_ALIGNED(32, int16_t, tmp_buf[16 * 16]);
+ int32_t half;
+
+ /* Column pass: one call per group of 8 columns (element offsets 0 and 8). */
+ for (half = 0; half < 2; half++) {
+ const int32_t col_off = half << 3;
+
+ fdct8x16_1d_column(input + col_off, tmp_buf + col_off, src_stride);
+ }
+
+ /* Row pass: one call per 128-element half of the intermediate buffer. */
+ for (half = 0; half < 2; half++) {
+ const int32_t row_off = half << 7;
+
+ fdct16x8_1d_row(tmp_buf + row_off, output + row_off);
+ }
+}
+#endif // !CONFIG_VP9_HIGHBITDEPTH
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.h b/media/libvpx/libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.h
new file mode 100644
index 0000000000..4a9fce9a3d
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.h
@@ -0,0 +1,381 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_LOONGARCH_FWD_TXFM_LSX_H_
+#define VPX_VPX_DSP_LOONGARCH_FWD_TXFM_LSX_H_
+
+#include "vpx_dsp/loongarch/txfm_macros_lsx.h"
+#include "vpx_dsp/txfm_common.h"
+/*
+ * One-dimensional 4-point forward DCT on four vectors of 16-bit lanes.
+ *
+ * Only the low four halfwords of the butterfly results are interleaved
+ * (__lsx_vilvl_h), and each narrowing shift is handed the same vector for
+ * both of its operands. The inputs are named only in the first butterfly,
+ * before any output is assigned; callers such as vpx_fdct4x4_lsx pass the
+ * same variables as inputs and outputs.
+ *
+ * coeff_m packs 16-bit constants. Read as halfword lanes 0..4 (low element
+ * first) they are 11585, -11585, 15137, 6270 and -15137 -- presumably
+ * cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64 and -cospi_8_64;
+ * confirm against txfm_common.h.
+ */
+#define VP9_FDCT4(in0, in1, in2, in3, out0, out1, out2, out3) \
+ do { \
+ __m128i cnst0_m, cnst1_m, cnst2_m, cnst3_m; \
+ __m128i vec0_m, vec1_m, vec2_m, vec3_m; \
+ __m128i vec4_m, vec5_m, vec6_m, vec7_m; \
+ __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x000000000000c4df }; \
+ \
+ LSX_BUTTERFLY_4_H(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m); \
+ DUP2_ARG2(__lsx_vilvl_h, vec1_m, vec0_m, vec3_m, vec2_m, vec0_m, vec2_m); \
+ DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 0, coeff_m, 1, cnst0_m, cnst1_m); \
+ cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \
+ vec5_m = __lsx_vdp2_w_h(vec0_m, cnst1_m); \
+ DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 4, coeff_m, 3, cnst2_m, cnst3_m); \
+ cnst2_m = __lsx_vpackev_h(cnst3_m, cnst2_m); \
+ vec7_m = __lsx_vdp2_w_h(vec2_m, cnst2_m); \
+ \
+ vec4_m = __lsx_vdp2_w_h(vec0_m, cnst0_m); \
+ cnst2_m = __lsx_vreplvei_h(coeff_m, 2); \
+ cnst2_m = __lsx_vpackev_h(cnst2_m, cnst3_m); \
+ vec6_m = __lsx_vdp2_w_h(vec2_m, cnst2_m); \
+ /* vec4_m..vec7_m go to out0, out2, out1, out3 (DUP4_ARG3 argument order). */ \
+ DUP4_ARG3(__lsx_vssrarni_h_w, vec4_m, vec4_m, DCT_CONST_BITS, vec5_m, \
+ vec5_m, DCT_CONST_BITS, vec6_m, vec6_m, DCT_CONST_BITS, vec7_m, \
+ vec7_m, DCT_CONST_BITS, out0, out2, out1, out3); \
+ } while (0)
+/*
+ * One-dimensional 8-point forward DCT on eight vectors of eight 16-bit lanes.
+ *
+ * The inputs are named only in the first butterfly, so callers may (and in
+ * vpx_fdct8x8_lsx do) pass the same variables as inputs and outputs.
+ * out4, out6, out0 and out2 are produced in stage 1; out1, out5, out7 and
+ * out3 are produced in stage 4.
+ *
+ * coeff_m packs 16-bit constants. Read as halfword lanes 0..7 (low element
+ * first) they are 11585, -11585, 15137, 6270, 16069, 3196, 13623 and 9102 --
+ * presumably cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, cospi_4_64,
+ * cospi_28_64, cospi_12_64 and cospi_20_64; confirm against txfm_common.h.
+ */
+#define VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
+ out3, out4, out5, out6, out7) \
+ do { \
+ __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m; \
+ __m128i s7_m, x0_m, x1_m, x2_m, x3_m; \
+ __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x238e35370c7c3ec5 }; \
+ \
+ /* FDCT stage1 */ \
+ LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, \
+ s2_m, s3_m, s4_m, s5_m, s6_m, s7_m); \
+ LSX_BUTTERFLY_4_H(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \
+ DUP2_ARG2(__lsx_vilvh_h, x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \
+ DUP2_ARG2(__lsx_vilvl_h, x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \
+ DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 0, coeff_m, 1, x0_m, x1_m); \
+ x1_m = __lsx_vpackev_h(x1_m, x0_m); \
+ DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, out4); \
+ \
+ DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 2, coeff_m, 3, x2_m, x3_m); \
+ x2_m = __lsx_vneg_h(x2_m); \
+ x2_m = __lsx_vpackev_h(x3_m, x2_m); \
+ DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out6); \
+ \
+ DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, out0); \
+ x2_m = __lsx_vreplvei_h(coeff_m, 2); \
+ x2_m = __lsx_vpackev_h(x2_m, x3_m); \
+ DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out2); \
+ \
+ /* stage2 */ \
+ s1_m = __lsx_vilvl_h(s5_m, s6_m); \
+ s0_m = __lsx_vilvh_h(s5_m, s6_m); \
+ /* x0_m and x1_m still hold the lane-0 constant and the lane-0/1 pair. */ \
+ DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, s6_m); \
+ DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, s5_m); \
+ \
+ /* stage3 */ \
+ LSX_BUTTERFLY_4_H(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m); \
+ \
+ /* stage4 */ \
+ DUP2_ARG2(__lsx_vilvh_h, x3_m, x0_m, x2_m, x1_m, s4_m, s6_m); \
+ DUP2_ARG2(__lsx_vilvl_h, x3_m, x0_m, x2_m, x1_m, s5_m, s7_m); \
+ \
+ DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 4, coeff_m, 5, x0_m, x1_m); \
+ x1_m = __lsx_vpackev_h(x0_m, x1_m); \
+ DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m, out1); \
+ \
+ DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 6, coeff_m, 7, x2_m, x3_m); \
+ x2_m = __lsx_vpackev_h(x3_m, x2_m); \
+ DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out5); \
+ \
+ x1_m = __lsx_vreplvei_h(coeff_m, 5); \
+ x0_m = __lsx_vneg_h(x0_m); \
+ x0_m = __lsx_vpackev_h(x1_m, x0_m); \
+ DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m, out7); \
+ x2_m = __lsx_vreplvei_h(coeff_m, 6); \
+ x3_m = __lsx_vneg_h(x3_m); \
+ x2_m = __lsx_vpackev_h(x2_m, x3_m); \
+ DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out3); \
+ } while (0)
+/*
+ * In-place update of eight vectors of 16-bit lanes (despite the "4V" in the
+ * name). For every lane the sign bit is extracted with a logical shift right
+ * by 15 and then averaged with the lane itself via __lsx_vavg_h.
+ * Presumably this yields (x + (x < 0)) >> 1, i.e. a halving that rounds
+ * toward zero -- confirm the truncating behaviour of __lsx_vavg_h against the
+ * LSX reference.
+ */
+#define SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7) \
+ do { \
+ __m128i vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
+ \
+ DUP4_ARG2(__lsx_vsrli_h, in0, 15, in1, 15, in2, 15, in3, 15, vec0_m, \
+ vec1_m, vec2_m, vec3_m); \
+ DUP4_ARG2(__lsx_vsrli_h, in4, 15, in5, 15, in6, 15, in7, 15, vec4_m, \
+ vec5_m, vec6_m, vec7_m); \
+ DUP4_ARG2(__lsx_vavg_h, vec0_m, in0, vec1_m, in1, vec2_m, in2, vec3_m, \
+ in3, in0, in1, in2, in3); \
+ DUP4_ARG2(__lsx_vavg_h, vec4_m, in4, vec5_m, in5, vec6_m, in6, vec7_m, \
+ in7, in4, in5, in6, in7); \
+ } while (0)
+/*
+ * In-place post-processing of two vectors of signed 16-bit lanes:
+ *   x = (x + 1 + (x > 0 ? 1 : 0)) >> 2   (arithmetic shift).
+ * The "x > 0" mask is built as the bitwise inverse of "x <= 0" and is taken
+ * before x is modified.
+ */
+#define FDCT32_POSTPROC_2V_POS_H(vec0, vec1) \
+ do { \
+ __m128i tp0_m, tp1_m; \
+ __m128i one = __lsx_vreplgr2vr_h(1); \
+ \
+ tp0_m = __lsx_vslei_h(vec0, 0); \
+ tp1_m = __lsx_vslei_h(vec1, 0); \
+ tp0_m = __lsx_vxori_b(tp0_m, 255); \
+ tp1_m = __lsx_vxori_b(tp1_m, 255); \
+ vec0 = __lsx_vadd_h(vec0, one); \
+ vec1 = __lsx_vadd_h(vec1, one); \
+ tp0_m = __lsx_vand_v(one, tp0_m); \
+ tp1_m = __lsx_vand_v(one, tp1_m); \
+ vec0 = __lsx_vadd_h(vec0, tp0_m); \
+ vec1 = __lsx_vadd_h(vec1, tp1_m); \
+ vec0 = __lsx_vsrai_h(vec0, 2); \
+ vec1 = __lsx_vsrai_h(vec1, 2); \
+ } while (0)
+/*
+ * In-place post-processing of two vectors of signed 16-bit lanes:
+ *   x = (x + 1 + (x < 0 ? 1 : 0)) >> 2   (arithmetic shift).
+ * The "x < 0" mask is taken before x is modified. one_m comes from
+ * __lsx_vldi(0x401), which presumably replicates the value 1 into every
+ * halfword -- confirm the vldi immediate encoding.
+ */
+#define FDCT_POSTPROC_2V_NEG_H(vec0, vec1) \
+ do { \
+ __m128i tp0_m, tp1_m; \
+ __m128i one_m = __lsx_vldi(0x401); \
+ \
+ tp0_m = __lsx_vslti_h(vec0, 0); \
+ tp1_m = __lsx_vslti_h(vec1, 0); \
+ vec0 = __lsx_vadd_h(vec0, one_m); \
+ vec1 = __lsx_vadd_h(vec1, one_m); \
+ tp0_m = __lsx_vand_v(one_m, tp0_m); \
+ tp1_m = __lsx_vand_v(one_m, tp1_m); \
+ vec0 = __lsx_vadd_h(vec0, tp0_m); \
+ vec1 = __lsx_vadd_h(vec1, tp1_m); \
+ vec0 = __lsx_vsrai_h(vec0, 2); \
+ vec1 = __lsx_vsrai_h(vec1, 2); \
+ } while (0)
+/*
+ * In-place post-processing of one vector of signed 32-bit lanes:
+ *   x = (x + 1 + (x < 0 ? 1 : 0)) >> 2   (arithmetic shift).
+ * The "x < 0" mask is taken before x is modified.
+ */
+#define FDCT32_POSTPROC_NEG_W(vec) \
+ do { \
+ __m128i temp_m; \
+ __m128i one_m = __lsx_vreplgr2vr_w(1); \
+ \
+ temp_m = __lsx_vslti_w(vec, 0); \
+ vec = __lsx_vadd_w(vec, one_m); \
+ temp_m = __lsx_vand_v(one_m, temp_m); \
+ vec = __lsx_vadd_w(vec, temp_m); \
+ vec = __lsx_vsrai_w(vec, 2); \
+ } while (0)
+/*
+ * Multiplies pairs of 32-bit lane vectors by the constant pair
+ * (const0, const1) using 64-bit dot products, then narrows back to 32 bits
+ * with a rounding, saturating right shift by DCT_CONST_BITS.
+ *
+ * Reading the interleave/pack sequence, the results appear to be
+ *   out0 = round(reg0_left  * const0 - reg1_left  * const1)
+ *   out1 = round(reg0_right * const0 - reg1_right * const1)
+ *   out2 = round(reg1_left  * const0 + reg0_left  * const1)
+ *   out3 = round(reg1_right * const0 + reg0_right * const1)
+ * NOTE(review): lane ordering inferred from the LSX intrinsic definitions --
+ * confirm before relying on the exact formulas.
+ *
+ * const0 and const1 are cast without surrounding parentheses, so pass simple
+ * expressions.
+ */
+#define DOTP_CONST_PAIR_W(reg0_left, reg1_left, reg0_right, reg1_right, \
+ const0, const1, out0, out1, out2, out3) \
+ do { \
+ __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \
+ __m128i tp0_m, tp1_m, tp2_m, tp3_m, _tmp0, _tmp1; \
+ __m128i k0_m = __lsx_vreplgr2vr_w((int32_t)const0); \
+ \
+ s0_m = __lsx_vreplgr2vr_w((int32_t)const1); \
+ k0_m = __lsx_vpackev_w(s0_m, k0_m); \
+ /* _tmp0/_tmp1 are the negated reg1 inputs used for the difference terms. */ \
+ DUP2_ARG1(__lsx_vneg_w, reg1_left, reg1_right, _tmp0, _tmp1); \
+ s1_m = __lsx_vilvl_w(_tmp0, reg0_left); \
+ s0_m = __lsx_vilvh_w(_tmp0, reg0_left); \
+ s3_m = __lsx_vilvl_w(reg0_left, reg1_left); \
+ s2_m = __lsx_vilvh_w(reg0_left, reg1_left); \
+ s5_m = __lsx_vilvl_w(_tmp1, reg0_right); \
+ s4_m = __lsx_vilvh_w(_tmp1, reg0_right); \
+ s7_m = __lsx_vilvl_w(reg0_right, reg1_right); \
+ s6_m = __lsx_vilvh_w(reg0_right, reg1_right); \
+ DUP2_ARG2(__lsx_vdp2_d_w, s0_m, k0_m, s1_m, k0_m, tp0_m, tp1_m); \
+ DUP2_ARG2(__lsx_vdp2_d_w, s4_m, k0_m, s5_m, k0_m, tp2_m, tp3_m); \
+ DUP2_ARG3(__lsx_vssrarni_w_d, tp0_m, tp1_m, DCT_CONST_BITS, tp2_m, tp3_m, \
+ DCT_CONST_BITS, out0, out1); \
+ DUP2_ARG2(__lsx_vdp2_d_w, s2_m, k0_m, s3_m, k0_m, tp0_m, tp1_m); \
+ DUP2_ARG2(__lsx_vdp2_d_w, s6_m, k0_m, s7_m, k0_m, tp2_m, tp3_m); \
+ DUP2_ARG3(__lsx_vssrarni_w_d, tp0_m, tp1_m, DCT_CONST_BITS, tp2_m, tp3_m, \
+ DCT_CONST_BITS, out2, out3); \
+ } while (0)
+/*
+ * Adds four vectors of 16-bit values (in0..in3) to four rows of 8-bit pixels
+ * at dst and writes the result back, 8 bytes per row.
+ *
+ * Each row is read with a 16-byte load, but only its low 8 bytes are widened
+ * to 16 bits, summed, narrowed to unsigned bytes with saturation (shift
+ * amount 0) and stored as one doubleword.
+ *
+ * _stride, _stride2 and _stride3 are used both as byte offsets for the
+ * indexed loads and in pointer arithmetic on dst for the stores, so dst is
+ * presumably a pointer to a one-byte type (e.g. uint8_t *) -- verify against
+ * callers.
+ */
+#define VP9_ADDBLK_ST8x4_UB(dst, _stride, _stride2, _stride3, in0, in1, in2, \
+ in3) \
+ do { \
+ __m128i dst0_m, dst1_m, dst2_m, dst3_m; \
+ __m128i tmp0_m, tmp1_m; \
+ __m128i res0_m, res1_m, res2_m, res3_m; \
+ \
+ dst0_m = __lsx_vld(dst, 0); \
+ DUP2_ARG2(__lsx_vldx, dst, _stride, dst, _stride2, dst1_m, dst2_m); \
+ dst3_m = __lsx_vldx(dst, _stride3); \
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, dst0_m, 0, dst1_m, 0, dst2_m, 0, dst3_m, 0, \
+ res0_m, res1_m, res2_m, res3_m); \
+ DUP4_ARG2(__lsx_vadd_h, res0_m, in0, res1_m, in1, res2_m, in2, res3_m, \
+ in3, res0_m, res1_m, res2_m, res3_m); \
+ DUP2_ARG3(__lsx_vssrarni_bu_h, res1_m, res0_m, 0, res3_m, res2_m, 0, \
+ tmp0_m, tmp1_m); \
+ __lsx_vstelm_d(tmp0_m, dst, 0, 0); \
+ __lsx_vstelm_d(tmp0_m, dst + _stride, 0, 1); \
+ __lsx_vstelm_d(tmp1_m, dst + _stride2, 0, 0); \
+ __lsx_vstelm_d(tmp1_m, dst + _stride3, 0, 1); \
+ } while (0)
+/*
+ * Even-output half used by the 16-point forward DCT row pass: an 8-point
+ * transform over eight vectors of 16-bit lanes. The statement sequence and
+ * coefficient table are the same as in VP9_FDCT8.
+ *
+ * The inputs are named only in the first butterfly, so the caller may pass
+ * the same variables as inputs and outputs (fdct16x8_1d_row does so with
+ * tmp0..tmp7).
+ */
+#define FDCT8x16_EVEN(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3, out4, out5, out6, out7) \
+ do { \
+ __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \
+ __m128i x0_m, x1_m, x2_m, x3_m; \
+ __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x238e35370c7c3ec5 }; \
+ \
+ /* FDCT stage1 */ \
+ LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, \
+ s2_m, s3_m, s4_m, s5_m, s6_m, s7_m); \
+ LSX_BUTTERFLY_4_H(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \
+ DUP2_ARG2(__lsx_vilvh_h, x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \
+ DUP2_ARG2(__lsx_vilvl_h, x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \
+ DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 0, coeff_m, 1, x0_m, x1_m); \
+ x1_m = __lsx_vpackev_h(x1_m, x0_m); \
+ DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, out4); \
+ \
+ DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 2, coeff_m, 3, x2_m, x3_m); \
+ x2_m = __lsx_vneg_h(x2_m); \
+ x2_m = __lsx_vpackev_h(x3_m, x2_m); \
+ DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out6); \
+ \
+ DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, out0); \
+ x2_m = __lsx_vreplvei_h(coeff_m, 2); \
+ x2_m = __lsx_vpackev_h(x2_m, x3_m); \
+ DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out2); \
+ \
+ /* stage2 */ \
+ s1_m = __lsx_vilvl_h(s5_m, s6_m); \
+ s0_m = __lsx_vilvh_h(s5_m, s6_m); \
+ \
+ DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, s6_m); \
+ DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, s5_m); \
+ \
+ /* stage3 */ \
+ LSX_BUTTERFLY_4_H(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m); \
+ \
+ /* stage4 */ \
+ DUP2_ARG2(__lsx_vilvh_h, x3_m, x0_m, x2_m, x1_m, s4_m, s6_m); \
+ DUP2_ARG2(__lsx_vilvl_h, x3_m, x0_m, x2_m, x1_m, s5_m, s7_m); \
+ \
+ DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 4, coeff_m, 5, x0_m, x1_m); \
+ x1_m = __lsx_vpackev_h(x0_m, x1_m); \
+ DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m, out1); \
+ \
+ DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 6, coeff_m, 7, x2_m, x3_m); \
+ x2_m = __lsx_vpackev_h(x3_m, x2_m); \
+ DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out5); \
+ \
+ x1_m = __lsx_vreplvei_h(coeff_m, 5); \
+ x0_m = __lsx_vneg_h(x0_m); \
+ x0_m = __lsx_vpackev_h(x1_m, x0_m); \
+ DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m, out7); \
+ \
+ x2_m = __lsx_vreplvei_h(coeff_m, 6); \
+ x3_m = __lsx_vneg_h(x3_m); \
+ x2_m = __lsx_vpackev_h(x2_m, x3_m); \
+ DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out3); \
+ } while (0)
+/*
+ * Odd-output half used by the 16-point forward DCT row pass. Takes eight
+ * vectors of 16-bit lanes (input0..input7; in fdct16x8_1d_row these are the
+ * second group of LSX_BUTTERFLY_16_H outputs) and produces eight result
+ * vectors.
+ *
+ * The output parameters are declared in the order out1, out3, out5, out7,
+ * out9, out11, out13, out15, but they are assigned in a different order
+ * inside the body (out1, out15, out9, out7, out5, out11, out13, out3).
+ *
+ * coeff_m, coeff1_m and coeff2_m pack the 16-bit multiplier constants, which
+ * are extracted per lane with __lsx_vreplvei_h.
+ */
+#define FDCT8x16_ODD(input0, input1, input2, input3, input4, input5, input6, \
+ input7, out1, out3, out5, out7, out9, out11, out13, \
+ out15) \
+ do { \
+ __m128i stp21_m, stp22_m, stp23_m, stp24_m, stp25_m, stp26_m; \
+ __m128i stp30_m, stp31_m, stp32_m, stp33_m, stp34_m, stp35_m; \
+ __m128i stp36_m, stp37_m, vec0_m, vec1_m; \
+ __m128i vec2_m, vec3_m, vec4_m, vec5_m, vec6_m; \
+ __m128i cnst0_m, cnst1_m, cnst4_m, cnst5_m; \
+ __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x238e3537e782c4df }; \
+ __m128i coeff1_m = { 0x289a317906463fb1, 0x12943d3f1e2b3871 }; \
+ __m128i coeff2_m = { 0xed6cd766c78fc04f, 0x0 }; \
+ \
+ /* stp 1 */ \
+ DUP2_ARG2(__lsx_vilvh_h, input2, input5, input3, input4, vec2_m, vec4_m); \
+ DUP2_ARG2(__lsx_vilvl_h, input2, input5, input3, input4, vec3_m, vec5_m); \
+ \
+ cnst4_m = __lsx_vreplvei_h(coeff_m, 0); \
+ DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst4_m, stp25_m); \
+ \
+ cnst5_m = __lsx_vreplvei_h(coeff_m, 1); \
+ cnst5_m = __lsx_vpackev_h(cnst5_m, cnst4_m); \
+ DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst5_m, stp22_m); \
+ DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst4_m, stp24_m); \
+ DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst5_m, stp23_m); \
+ \
+ /* stp2 */ \
+ LSX_BUTTERFLY_4_H(input0, input1, stp22_m, stp23_m, stp30_m, stp31_m, \
+ stp32_m, stp33_m); \
+ LSX_BUTTERFLY_4_H(input7, input6, stp25_m, stp24_m, stp37_m, stp36_m, \
+ stp35_m, stp34_m); \
+ \
+ DUP2_ARG2(__lsx_vilvh_h, stp36_m, stp31_m, stp35_m, stp32_m, vec2_m, \
+ vec4_m); \
+ DUP2_ARG2(__lsx_vilvl_h, stp36_m, stp31_m, stp35_m, stp32_m, vec3_m, \
+ vec5_m); \
+ \
+ DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 2, coeff_m, 3, cnst0_m, cnst1_m); \
+ cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m); \
+ DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m, stp26_m); \
+ \
+ cnst0_m = __lsx_vreplvei_h(coeff_m, 4); \
+ cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \
+ DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m, stp21_m); \
+ \
+ DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 5, coeff_m, 2, cnst0_m, cnst1_m); \
+ cnst1_m = __lsx_vpackev_h(cnst0_m, cnst1_m); \
+ DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m, stp25_m); \
+ \
+ cnst0_m = __lsx_vreplvei_h(coeff_m, 3); \
+ cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \
+ DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m, stp22_m); \
+ \
+ /* stp4 */ \
+ LSX_BUTTERFLY_4_H(stp30_m, stp37_m, stp26_m, stp21_m, vec6_m, vec2_m, \
+ vec4_m, vec5_m); \
+ LSX_BUTTERFLY_4_H(stp33_m, stp34_m, stp25_m, stp22_m, stp21_m, stp23_m, \
+ stp24_m, stp31_m); \
+ \
+ vec1_m = __lsx_vilvl_h(vec2_m, vec6_m); \
+ vec0_m = __lsx_vilvh_h(vec2_m, vec6_m); \
+ DUP2_ARG2(__lsx_vreplvei_h, coeff1_m, 0, coeff1_m, 1, cnst0_m, cnst1_m); \
+ cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m); \
+ \
+ DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out1); \
+ \
+ cnst0_m = __lsx_vreplvei_h(coeff2_m, 0); \
+ cnst0_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \
+ DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out15); \
+ \
+ vec1_m = __lsx_vilvl_h(vec4_m, vec5_m); \
+ vec0_m = __lsx_vilvh_h(vec4_m, vec5_m); \
+ DUP2_ARG2(__lsx_vreplvei_h, coeff1_m, 2, coeff1_m, 3, cnst0_m, cnst1_m); \
+ cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \
+ \
+ DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m, out9); \
+ \
+ cnst1_m = __lsx_vreplvei_h(coeff2_m, 2); \
+ cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m); \
+ DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out7); \
+ \
+ vec1_m = __lsx_vilvl_h(stp23_m, stp21_m); \
+ vec0_m = __lsx_vilvh_h(stp23_m, stp21_m); \
+ DUP2_ARG2(__lsx_vreplvei_h, coeff1_m, 4, coeff1_m, 5, cnst0_m, cnst1_m); \
+ cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m); \
+ DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out5); \
+ \
+ cnst0_m = __lsx_vreplvei_h(coeff2_m, 1); \
+ cnst0_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \
+ DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out11); \
+ \
+ vec1_m = __lsx_vilvl_h(stp24_m, stp31_m); \
+ vec0_m = __lsx_vilvh_h(stp24_m, stp31_m); \
+ DUP2_ARG2(__lsx_vreplvei_h, coeff1_m, 6, coeff1_m, 7, cnst0_m, cnst1_m); \
+ cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \
+ \
+ DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m, out13); \
+ \
+ cnst1_m = __lsx_vreplvei_h(coeff2_m, 3); \
+ cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m); \
+ DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out3); \
+ } while (0)
+/*
+ * Building blocks of the 16x16 forward DCT, shared between translation units.
+ * vpx_fdct16x16_lsx calls fdct8x16_1d_column once per group of 8 columns
+ * (writing into a 16 * 16 int16_t scratch buffer) and then fdct16x8_1d_row
+ * once per 128-element half of that buffer. fdct16x8_1d_row also stores
+ * intermediate values through its input pointer, so input is not read-only.
+ */
+void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr,
+ int32_t src_stride);
+void fdct16x8_1d_row(int16_t *input, int16_t *output);
+#endif // VPX_VPX_DSP_LOONGARCH_FWD_TXFM_LSX_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/idct32x32_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/idct32x32_lsx.c
new file mode 100644
index 0000000000..ec07f57d90
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/idct32x32_lsx.c
@@ -0,0 +1,834 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/fwd_txfm_lsx.h"
+/*
+ * Widens the 16 unsigned bytes of _in to 16-bit lanes: _out0 receives the low
+ * eight bytes and _out1 the high eight bytes.
+ * _out0 is assigned before _in is read for _out1, so _out0 must not be the
+ * same variable as _in.
+ */
+#define UNPCK_UB_SH(_in, _out0, _out1) \
+ do { \
+ _out0 = __lsx_vsllwil_hu_bu(_in, 0); \
+ _out1 = __lsx_vexth_hu_bu(_in); \
+ } while (0)
+/*
+ * Reads an 8-row by 32-column block of int16_t from input (row pitch 64
+ * bytes), transposes it as four 8x8 tiles and writes the result to tmp_buf as
+ * 32 consecutive 16-byte vectors (byte offsets 0..496).
+ *
+ * Assuming LSX_TRANSPOSE8x8_H is a plain 8x8 halfword transpose, vector k of
+ * tmp_buf then holds column k of the input block.
+ */
+static void idct32x8_row_transpose_store(const int16_t *input,
+ int16_t *tmp_buf) {
+ __m128i m0, m1, m2, m3, m4, m5, m6, m7;
+ __m128i n0, n1, n2, n3, n4, n5, n6, n7;
+
+ /* 1st & 2nd 8x8 */
+ DUP4_ARG2(__lsx_vld, input, 0, input, 64, input, 128, input, 192, m0, n0, m1,
+ n1);
+ DUP4_ARG2(__lsx_vld, input, 256, input, 320, input, 384, input, 448, m2, n2,
+ m3, n3);
+ DUP4_ARG2(__lsx_vld, input, 16, input, 80, input, 144, input, 208, m4, n4, m5,
+ n5);
+ DUP4_ARG2(__lsx_vld, input, 272, input, 336, input, 400, input, 464, m6, n6,
+ m7, n7);
+
+ LSX_TRANSPOSE8x8_H(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
+ n3);
+ LSX_TRANSPOSE8x8_H(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
+ n7);
+
+ __lsx_vst(m0, tmp_buf, 0);
+ __lsx_vst(n0, tmp_buf, 16);
+ __lsx_vst(m1, tmp_buf, 32);
+ __lsx_vst(n1, tmp_buf, 48);
+ __lsx_vst(m2, tmp_buf, 64);
+ __lsx_vst(n2, tmp_buf, 80);
+ __lsx_vst(m3, tmp_buf, 96);
+ __lsx_vst(n3, tmp_buf, 112);
+ __lsx_vst(m4, tmp_buf, 128);
+ __lsx_vst(n4, tmp_buf, 144);
+ __lsx_vst(m5, tmp_buf, 160);
+ __lsx_vst(n5, tmp_buf, 176);
+ __lsx_vst(m6, tmp_buf, 192);
+ __lsx_vst(n6, tmp_buf, 208);
+ __lsx_vst(m7, tmp_buf, 224);
+ __lsx_vst(n7, tmp_buf, 240);
+
+ /* 3rd & 4th 8x8 */
+ DUP4_ARG2(__lsx_vld, input, 32, input, 96, input, 160, input, 224, m0, n0, m1,
+ n1);
+ DUP4_ARG2(__lsx_vld, input, 288, input, 352, input, 416, input, 480, m2, n2,
+ m3, n3);
+ DUP4_ARG2(__lsx_vld, input, 48, input, 112, input, 176, input, 240, m4, n4,
+ m5, n5);
+ DUP4_ARG2(__lsx_vld, input, 304, input, 368, input, 432, input, 496, m6, n6,
+ m7, n7);
+
+ LSX_TRANSPOSE8x8_H(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
+ n3);
+ LSX_TRANSPOSE8x8_H(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
+ n7);
+
+ __lsx_vst(m0, tmp_buf, 256);
+ __lsx_vst(n0, tmp_buf, 272);
+ __lsx_vst(m1, tmp_buf, 288);
+ __lsx_vst(n1, tmp_buf, 304);
+ __lsx_vst(m2, tmp_buf, 320);
+ __lsx_vst(n2, tmp_buf, 336);
+ __lsx_vst(m3, tmp_buf, 352);
+ __lsx_vst(n3, tmp_buf, 368);
+ __lsx_vst(m4, tmp_buf, 384);
+ __lsx_vst(n4, tmp_buf, 400);
+ __lsx_vst(m5, tmp_buf, 416);
+ __lsx_vst(n5, tmp_buf, 432);
+ __lsx_vst(m6, tmp_buf, 448);
+ __lsx_vst(n6, tmp_buf, 464);
+ __lsx_vst(m7, tmp_buf, 480);
+ __lsx_vst(n7, tmp_buf, 496);
+}
+/*
+ * Even part of the row transform for an 8-line slice (32-point inverse DCT,
+ * per the function name).
+ *
+ * tmp_buf     - source vectors; only the even-numbered 16-byte vectors are
+ *               read: byte offsets 0, 64, ..., 448 in stage 1 and
+ *               32, 96, ..., 480 in stage 2.
+ * tmp_eve_buf - receives 16 result vectors at byte offsets 0..240. Each of
+ *               the four final butterflies writes its outputs to mirrored
+ *               slots (for example 0/240 and 16/224).
+ */
+static void idct32x8_row_even_process_store(int16_t *tmp_buf,
+ int16_t *tmp_eve_buf) {
+ __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+ __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+ __m128i stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7;
+ __m128i tmp0;
+
+ /* Even stage 1 */
+ DUP4_ARG2(__lsx_vld, tmp_buf, 0, tmp_buf, 64, tmp_buf, 128, tmp_buf, 192,
+ reg0, reg1, reg2, reg3);
+ DUP4_ARG2(__lsx_vld, tmp_buf, 256, tmp_buf, 320, tmp_buf, 384, tmp_buf, 448,
+ reg4, reg5, reg6, reg7);
+
+ DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7);
+ DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3);
+ LSX_BUTTERFLY_4_H(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0);
+ DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+
+ loc1 = vec3;
+ loc0 = vec1;
+
+ DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4);
+ DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6);
+ LSX_BUTTERFLY_4_H(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0);
+ LSX_BUTTERFLY_4_H(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4);
+ LSX_BUTTERFLY_4_H(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5);
+
+ /* Even stage 2 */
+ DUP4_ARG2(__lsx_vld, tmp_buf, 32, tmp_buf, 96, tmp_buf, 160, tmp_buf, 224,
+ reg0, reg1, reg2, reg3);
+ DUP4_ARG2(__lsx_vld, tmp_buf, 288, tmp_buf, 352, tmp_buf, 416, tmp_buf, 480,
+ reg4, reg5, reg6, reg7);
+ DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7);
+ DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3);
+ DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5);
+ DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1);
+
+ vec0 = __lsx_vadd_h(reg0, reg4);
+ reg0 = __lsx_vsub_h(reg0, reg4);
+ reg4 = __lsx_vadd_h(reg6, reg2);
+ reg6 = __lsx_vsub_h(reg6, reg2);
+ reg2 = __lsx_vadd_h(reg1, reg5);
+ reg1 = __lsx_vsub_h(reg1, reg5);
+ reg5 = __lsx_vadd_h(reg7, reg3);
+ reg7 = __lsx_vsub_h(reg7, reg3);
+ reg3 = vec0;
+
+ vec1 = reg2;
+ reg2 = __lsx_vadd_h(reg3, reg4);
+ reg3 = __lsx_vsub_h(reg3, reg4);
+ reg4 = __lsx_vsub_h(reg5, vec1);
+ reg5 = __lsx_vadd_h(reg5, vec1);
+
+ tmp0 = __lsx_vneg_h(reg6);
+ DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7);
+ DOTP_CONST_PAIR(tmp0, reg1, cospi_24_64, cospi_8_64, reg6, reg1);
+
+ vec0 = __lsx_vsub_h(reg0, reg6);
+ reg0 = __lsx_vadd_h(reg0, reg6);
+ vec1 = __lsx_vsub_h(reg7, reg1);
+ reg7 = __lsx_vadd_h(reg7, reg1);
+
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1);
+ DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4);
+
+ /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */
+ LSX_BUTTERFLY_4_H(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0);
+ __lsx_vst(loc0, tmp_eve_buf, 240);
+ __lsx_vst(loc1, tmp_eve_buf, 0);
+ __lsx_vst(loc2, tmp_eve_buf, 224);
+ __lsx_vst(loc3, tmp_eve_buf, 16);
+
+ LSX_BUTTERFLY_4_H(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0);
+ __lsx_vst(loc0, tmp_eve_buf, 208);
+ __lsx_vst(loc1, tmp_eve_buf, 32);
+ __lsx_vst(loc2, tmp_eve_buf, 192);
+ __lsx_vst(loc3, tmp_eve_buf, 48);
+
+ /* Store 8 */
+ LSX_BUTTERFLY_4_H(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0);
+ __lsx_vst(loc0, tmp_eve_buf, 176);
+ __lsx_vst(loc1, tmp_eve_buf, 64);
+ __lsx_vst(loc2, tmp_eve_buf, 160);
+ __lsx_vst(loc3, tmp_eve_buf, 80);
+
+ LSX_BUTTERFLY_4_H(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0);
+ __lsx_vst(loc0, tmp_eve_buf, 144);
+ __lsx_vst(loc1, tmp_eve_buf, 96);
+ __lsx_vst(loc2, tmp_eve_buf, 128);
+ __lsx_vst(loc3, tmp_eve_buf, 112);
+}
+/*
+ * Odd part of the row transform for an 8-line slice (32-point inverse DCT,
+ * per the function name).
+ *
+ * tmp_buf     - source vectors; only odd-numbered 16-byte vectors are read:
+ *               numbers 1, 7, 9, 15, 17, 23, 25, 31 in stage 1 and
+ *               3, 5, 11, 13, 19, 21, 27, 29 in stage 2.
+ * tmp_odd_buf - receives 16 vectors (byte offsets 0..240). Stages 1 and 2
+ *               store partial results there; stage 3 loads them back,
+ *               combines them and overwrites the same slots.
+ */
+static void idct32x8_row_odd_process_store(int16_t *tmp_buf,
+ int16_t *tmp_odd_buf) {
+ __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+ __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+
+ /* Odd stage 1 */
+ DUP4_ARG2(__lsx_vld, tmp_buf, 16, tmp_buf, 112, tmp_buf, 144, tmp_buf, 240,
+ reg0, reg1, reg2, reg3);
+ DUP4_ARG2(__lsx_vld, tmp_buf, 272, tmp_buf, 368, tmp_buf, 400, tmp_buf, 496,
+ reg4, reg5, reg6, reg7);
+
+ DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7);
+ DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4);
+ DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5);
+ DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6);
+
+ vec0 = __lsx_vadd_h(reg0, reg3);
+ reg0 = __lsx_vsub_h(reg0, reg3);
+ reg3 = __lsx_vadd_h(reg7, reg4);
+ reg7 = __lsx_vsub_h(reg7, reg4);
+ reg4 = __lsx_vadd_h(reg1, reg2);
+ reg1 = __lsx_vsub_h(reg1, reg2);
+ reg2 = __lsx_vadd_h(reg6, reg5);
+ reg6 = __lsx_vsub_h(reg6, reg5);
+ reg5 = vec0;
+
+ /* 4 Stores */
+ DUP2_ARG2(__lsx_vadd_h, reg5, reg4, reg3, reg2, vec0, vec1);
+ __lsx_vst(vec0, tmp_odd_buf, 64);
+ __lsx_vst(vec1, tmp_odd_buf, 80);
+
+ DUP2_ARG2(__lsx_vsub_h, reg5, reg4, reg3, reg2, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1);
+ __lsx_vst(vec0, tmp_odd_buf, 0);
+ __lsx_vst(vec1, tmp_odd_buf, 16);
+
+ /* 4 Stores */
+ DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7);
+ DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6);
+ LSX_BUTTERFLY_4_H(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3);
+ __lsx_vst(vec0, tmp_odd_buf, 96);
+ __lsx_vst(vec1, tmp_odd_buf, 112);
+
+ DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3);
+ __lsx_vst(vec2, tmp_odd_buf, 32);
+ __lsx_vst(vec3, tmp_odd_buf, 48);
+
+ /* Odd stage 2 */
+ /* 8 loads */
+ DUP4_ARG2(__lsx_vld, tmp_buf, 48, tmp_buf, 80, tmp_buf, 176, tmp_buf, 208,
+ reg0, reg1, reg2, reg3);
+ DUP4_ARG2(__lsx_vld, tmp_buf, 304, tmp_buf, 336, tmp_buf, 432, tmp_buf, 464,
+ reg4, reg5, reg6, reg7);
+
+ DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6);
+ DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5);
+ DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4);
+ DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7);
+
+ /* 4 Stores */
+ DUP4_ARG2(__lsx_vsub_h, reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0,
+ vec1, vec2, vec3);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1);
+ DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3);
+
+ LSX_BUTTERFLY_4_H(loc3, loc2, loc0, loc1, vec1, vec0, vec2, vec3);
+ __lsx_vst(vec0, tmp_odd_buf, 192);
+ __lsx_vst(vec1, tmp_odd_buf, 240);
+
+ DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1);
+ __lsx_vst(vec0, tmp_odd_buf, 160);
+ __lsx_vst(vec1, tmp_odd_buf, 176);
+
+ /* 4 Stores */
+ DUP4_ARG2(__lsx_vadd_h, reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec1,
+ vec2, vec0, vec3);
+ LSX_BUTTERFLY_4_H(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2);
+ __lsx_vst(reg0, tmp_odd_buf, 208);
+ __lsx_vst(reg1, tmp_odd_buf, 224);
+
+ DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1);
+ __lsx_vst(reg0, tmp_odd_buf, 128);
+ __lsx_vst(reg1, tmp_odd_buf, 144);
+
+ /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */
+
+ /* Load 8 & Store 8 */
+ DUP4_ARG2(__lsx_vld, tmp_odd_buf, 0, tmp_odd_buf, 16, tmp_odd_buf, 32,
+ tmp_odd_buf, 48, reg0, reg1, reg2, reg3);
+ DUP4_ARG2(__lsx_vld, tmp_odd_buf, 128, tmp_odd_buf, 144, tmp_odd_buf, 160,
+ tmp_odd_buf, 176, reg4, reg5, reg6, reg7);
+ DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0,
+ loc1, loc2, loc3);
+ __lsx_vst(loc0, tmp_odd_buf, 0);
+ __lsx_vst(loc1, tmp_odd_buf, 16);
+ __lsx_vst(loc2, tmp_odd_buf, 32);
+ __lsx_vst(loc3, tmp_odd_buf, 48);
+
+ DUP2_ARG2(__lsx_vsub_h, reg0, reg4, reg1, reg5, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
+
+ DUP2_ARG2(__lsx_vsub_h, reg2, reg6, reg3, reg7, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+ __lsx_vst(loc0, tmp_odd_buf, 128);
+ __lsx_vst(loc1, tmp_odd_buf, 144);
+ __lsx_vst(loc2, tmp_odd_buf, 160);
+ __lsx_vst(loc3, tmp_odd_buf, 176);
+
+ /* Load 8 & Store 8 */
+ DUP4_ARG2(__lsx_vld, tmp_odd_buf, 64, tmp_odd_buf, 80, tmp_odd_buf, 96,
+ tmp_odd_buf, 112, reg1, reg2, reg0, reg3);
+ DUP4_ARG2(__lsx_vld, tmp_odd_buf, 192, tmp_odd_buf, 208, tmp_odd_buf, 224,
+ tmp_odd_buf, 240, reg4, reg5, reg6, reg7);
+
+ DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0,
+ loc1, loc2, loc3);
+ __lsx_vst(loc0, tmp_odd_buf, 64);
+ __lsx_vst(loc1, tmp_odd_buf, 80);
+ __lsx_vst(loc2, tmp_odd_buf, 96);
+ __lsx_vst(loc3, tmp_odd_buf, 112);
+
+ DUP2_ARG2(__lsx_vsub_h, reg0, reg4, reg3, reg7, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
+ DUP2_ARG2(__lsx_vsub_h, reg1, reg5, reg2, reg6, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+ __lsx_vst(loc0, tmp_odd_buf, 192);
+ __lsx_vst(loc1, tmp_odd_buf, 208);
+ __lsx_vst(loc2, tmp_odd_buf, 224);
+ __lsx_vst(loc3, tmp_odd_buf, 240);
+}
+
+static void idct_butterfly_transpose_store(int16_t *tmp_buf,
+ int16_t *tmp_eve_buf,
+ int16_t *tmp_odd_buf, int16_t *dst) { /* vld/vst immediates below are byte offsets */
+ __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; /* vec*: from tmp_odd_buf, loc*: from tmp_eve_buf */
+ __m128i m0, m1, m2, m3, m4, m5, m6, m7; /* even+odd sums, kept in registers */
+ __m128i n0, n1, n2, n3, n4, n5, n6, n7; /* even+odd sums, kept in registers */
+ __m128i reg0, reg1, reg2, reg3; /* even-odd differences, spilled to tmp_buf */
+
+ /* Final butterfly: combines the even and odd halves; sums stay in m0/m2/m4/m6 */
+ DUP4_ARG2(__lsx_vld, tmp_odd_buf, 0, tmp_odd_buf, 144, tmp_odd_buf, 224,
+ tmp_odd_buf, 96, vec0, vec1, vec2, vec3);
+ DUP4_ARG2(__lsx_vld, tmp_eve_buf, 0, tmp_eve_buf, 128, tmp_eve_buf, 64,
+ tmp_eve_buf, 192, loc0, loc1, loc2, loc3);
+
+ DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0,
+ m4, m2, m6);
+ DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, reg0,
+ reg1, reg2, reg3);
+ __lsx_vst(reg0, tmp_buf, 496); /* differences are parked at byte offsets 256..511 of tmp_buf */
+ __lsx_vst(reg1, tmp_buf, 368);
+ __lsx_vst(reg2, tmp_buf, 432);
+ __lsx_vst(reg3, tmp_buf, 304);
+
+ /* Second group: 8 loads; sums kept in m1/m5/m3/m7, 4 differences stored */
+ DUP4_ARG2(__lsx_vld, tmp_odd_buf, 64, tmp_odd_buf, 208, tmp_odd_buf, 160,
+ tmp_odd_buf, 48, vec0, vec1, vec2, vec3);
+ DUP4_ARG2(__lsx_vld, tmp_eve_buf, 32, tmp_eve_buf, 160, tmp_eve_buf, 96,
+ tmp_eve_buf, 224, loc0, loc1, loc2, loc3);
+
+ DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1,
+ m5, m3, m7);
+ DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, reg0,
+ reg1, reg2, reg3);
+ __lsx_vst(reg0, tmp_buf, 464);
+ __lsx_vst(reg1, tmp_buf, 336);
+ __lsx_vst(reg2, tmp_buf, 400);
+ __lsx_vst(reg3, tmp_buf, 272);
+
+ /* Third group: 8 loads; sums kept in n0/n4/n2/n6, 4 differences stored */
+ DUP4_ARG2(__lsx_vld, tmp_odd_buf, 32, tmp_odd_buf, 176, tmp_odd_buf, 192,
+ tmp_odd_buf, 112, vec0, vec1, vec2, vec3);
+ DUP4_ARG2(__lsx_vld, tmp_eve_buf, 16, tmp_eve_buf, 144, tmp_eve_buf, 80,
+ tmp_eve_buf, 208, loc0, loc1, loc2, loc3);
+
+ DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0,
+ n4, n2, n6);
+ DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, reg0,
+ reg1, reg2, reg3);
+ __lsx_vst(reg0, tmp_buf, 480);
+ __lsx_vst(reg1, tmp_buf, 352);
+ __lsx_vst(reg2, tmp_buf, 416);
+ __lsx_vst(reg3, tmp_buf, 288);
+
+ /* Fourth group: 8 loads; sums kept in n1/n5/n3/n7, 4 differences stored */
+ DUP4_ARG2(__lsx_vld, tmp_odd_buf, 80, tmp_odd_buf, 240, tmp_odd_buf, 128,
+ tmp_odd_buf, 16, vec0, vec1, vec2, vec3);
+ DUP4_ARG2(__lsx_vld, tmp_eve_buf, 48, tmp_eve_buf, 176, tmp_eve_buf, 112,
+ tmp_eve_buf, 240, loc0, loc1, loc2, loc3);
+ DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1,
+ n5, n3, n7);
+ DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, reg0,
+ reg1, reg2, reg3);
+ __lsx_vst(reg0, tmp_buf, 448);
+ __lsx_vst(reg1, tmp_buf, 320);
+ __lsx_vst(reg2, tmp_buf, 384);
+ __lsx_vst(reg3, tmp_buf, 256);
+
+ /* Transpose : 16 vectors held in m0..m7 / n0..n7 */
+ /* 1st & 2nd 8x8 tiles: dst rows are 64 bytes apart, tiles at column bytes 0 and 16 */
+ LSX_TRANSPOSE8x8_H(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
+ n3);
+ __lsx_vst(m0, dst, 0);
+ __lsx_vst(n0, dst, 64);
+ __lsx_vst(m1, dst, 128);
+ __lsx_vst(n1, dst, 192);
+ __lsx_vst(m2, dst, 256);
+ __lsx_vst(n2, dst, 320);
+ __lsx_vst(m3, dst, 384);
+ __lsx_vst(n3, dst, 448);
+
+ LSX_TRANSPOSE8x8_H(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
+ n7);
+ __lsx_vst(m4, dst, 16);
+ __lsx_vst(n4, dst, 80);
+ __lsx_vst(m5, dst, 144);
+ __lsx_vst(n5, dst, 208);
+ __lsx_vst(m6, dst, 272);
+ __lsx_vst(n6, dst, 336);
+ __lsx_vst(m7, dst, 400);
+ __lsx_vst(n7, dst, 464);
+
+ /* 3rd & 4th 8x8 tiles: reload the 16 differences parked in tmp_buf above */
+ DUP4_ARG2(__lsx_vld, tmp_buf, 256, tmp_buf, 272, tmp_buf, 288, tmp_buf, 304,
+ m0, n0, m1, n1);
+ DUP4_ARG2(__lsx_vld, tmp_buf, 320, tmp_buf, 336, tmp_buf, 352, tmp_buf, 368,
+ m2, n2, m3, n3);
+ DUP4_ARG2(__lsx_vld, tmp_buf, 384, tmp_buf, 400, tmp_buf, 416, tmp_buf, 432,
+ m4, n4, m5, n5);
+ DUP4_ARG2(__lsx_vld, tmp_buf, 448, tmp_buf, 464, tmp_buf, 480, tmp_buf, 496,
+ m6, n6, m7, n7);
+ LSX_TRANSPOSE8x8_H(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
+ n3);
+ LSX_TRANSPOSE8x8_H(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
+ n7);
+ __lsx_vst(m0, dst, 32); /* third tile: column byte offset 32 */
+ __lsx_vst(n0, dst, 96);
+ __lsx_vst(m1, dst, 160);
+ __lsx_vst(n1, dst, 224);
+ __lsx_vst(m2, dst, 288);
+ __lsx_vst(n2, dst, 352);
+ __lsx_vst(m3, dst, 416);
+ __lsx_vst(n3, dst, 480);
+ __lsx_vst(m4, dst, 48); /* fourth tile: column byte offset 48 */
+ __lsx_vst(n4, dst, 112);
+ __lsx_vst(m5, dst, 176);
+ __lsx_vst(n5, dst, 240);
+ __lsx_vst(m6, dst, 304);
+ __lsx_vst(n6, dst, 368);
+ __lsx_vst(m7, dst, 432);
+ __lsx_vst(n7, dst, 496);
+}
+
+static void idct32x8_1d_rows_lsx(const int16_t *input, int16_t *output) { /* row pass for one 32x8 slab (see callers) */
+ DECLARE_ALIGNED(32, int16_t, tmp_buf[8 * 32]); /* 512-byte scratch shared by all four helpers */
+ DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]); /* 16 vectors produced by the odd half */
+ DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]); /* 16 vectors produced by the even half */
+
+ idct32x8_row_transpose_store(input, &tmp_buf[0]); /* stage input into tmp_buf (helper defined earlier in the file) */
+ idct32x8_row_even_process_store(&tmp_buf[0], &tmp_eve_buf[0]); /* tmp_buf -> tmp_eve_buf */
+ idct32x8_row_odd_process_store(&tmp_buf[0], &tmp_odd_buf[0]); /* tmp_buf -> tmp_odd_buf */
+ idct_butterfly_transpose_store(&tmp_buf[0], &tmp_eve_buf[0], &tmp_odd_buf[0],
+ output); /* combine halves, transpose, write to output; reuses tmp_buf as spill space */
+}
+
+static void idct8x32_column_even_process_store(int16_t *tmp_buf,
+ int16_t *tmp_eve_buf) { /* even half of the column pass; writes 16 vectors to tmp_eve_buf */
+ __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+ __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+ __m128i stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7; /* stage 1 results held until stage 3 */
+ __m128i tmp0;
+
+ /* Even stage 1: loads 256 bytes apart, i.e. rows 0, 4, ..., 28 assuming a 64-byte row pitch */
+ DUP4_ARG2(__lsx_vld, tmp_buf, 0, tmp_buf, 256, tmp_buf, 512, tmp_buf, 768,
+ reg0, reg1, reg2, reg3);
+ DUP4_ARG2(__lsx_vld, tmp_buf, 1024, tmp_buf, 1280, tmp_buf, 1536, tmp_buf,
+ 1792, reg4, reg5, reg6, reg7);
+ tmp_buf += 64; /* 64 int16 = 128 bytes: the same offsets now address rows 2, 6, ..., 30 */
+
+ DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7);
+ DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3);
+ LSX_BUTTERFLY_4_H(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0);
+ DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+
+ loc1 = vec3; /* save before vec1/vec3 are overwritten by the next butterfly */
+ loc0 = vec1;
+
+ DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4);
+ DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6);
+ LSX_BUTTERFLY_4_H(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0);
+ LSX_BUTTERFLY_4_H(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4);
+ LSX_BUTTERFLY_4_H(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5);
+
+ /* Even stage 2 */
+ /* Load 8 vectors through the advanced tmp_buf pointer */
+ DUP4_ARG2(__lsx_vld, tmp_buf, 0, tmp_buf, 256, tmp_buf, 512, tmp_buf, 768,
+ reg0, reg1, reg2, reg3);
+ DUP4_ARG2(__lsx_vld, tmp_buf, 1024, tmp_buf, 1280, tmp_buf, 1536, tmp_buf,
+ 1792, reg4, reg5, reg6, reg7);
+ DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7);
+ DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3);
+ DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5);
+ DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1);
+
+ vec0 = __lsx_vadd_h(reg0, reg4); /* in-place add/sub chain: statement order matters */
+ reg0 = __lsx_vsub_h(reg0, reg4);
+ reg4 = __lsx_vadd_h(reg6, reg2);
+ reg6 = __lsx_vsub_h(reg6, reg2);
+ reg2 = __lsx_vadd_h(reg1, reg5);
+ reg1 = __lsx_vsub_h(reg1, reg5);
+ reg5 = __lsx_vadd_h(reg7, reg3);
+ reg7 = __lsx_vsub_h(reg7, reg3);
+ reg3 = vec0;
+
+ vec1 = reg2; /* keep the old reg2; it is overwritten on the next line */
+ reg2 = __lsx_vadd_h(reg3, reg4);
+ reg3 = __lsx_vsub_h(reg3, reg4);
+ reg4 = __lsx_vsub_h(reg5, vec1);
+ reg5 = __lsx_vadd_h(reg5, vec1);
+
+ tmp0 = __lsx_vneg_h(reg6); /* negated copy of reg6 fed to the second rotation below */
+ DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7);
+ DOTP_CONST_PAIR(tmp0, reg1, cospi_24_64, cospi_8_64, reg6, reg1);
+
+ vec0 = __lsx_vsub_h(reg0, reg6);
+ reg0 = __lsx_vadd_h(reg0, reg6);
+ vec1 = __lsx_vsub_h(reg7, reg1);
+ reg7 = __lsx_vadd_h(reg7, reg1);
+
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1);
+ DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4);
+
+ /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */
+ /* Store 8 vectors: tmp_eve_buf byte offsets 0..48 and 192..240 */
+ LSX_BUTTERFLY_4_H(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0);
+ __lsx_vst(loc1, tmp_eve_buf, 0);
+ __lsx_vst(loc3, tmp_eve_buf, 16);
+ __lsx_vst(loc2, tmp_eve_buf, 224);
+ __lsx_vst(loc0, tmp_eve_buf, 240);
+
+ LSX_BUTTERFLY_4_H(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0);
+ __lsx_vst(loc1, tmp_eve_buf, 32);
+ __lsx_vst(loc3, tmp_eve_buf, 48);
+ __lsx_vst(loc2, tmp_eve_buf, 192);
+ __lsx_vst(loc0, tmp_eve_buf, 208);
+
+ /* Store 8 vectors: tmp_eve_buf byte offsets 64..176 */
+ LSX_BUTTERFLY_4_H(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0);
+ __lsx_vst(loc1, tmp_eve_buf, 64);
+ __lsx_vst(loc3, tmp_eve_buf, 80);
+ __lsx_vst(loc2, tmp_eve_buf, 160);
+ __lsx_vst(loc0, tmp_eve_buf, 176);
+
+ LSX_BUTTERFLY_4_H(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0);
+ __lsx_vst(loc1, tmp_eve_buf, 96);
+ __lsx_vst(loc3, tmp_eve_buf, 112);
+ __lsx_vst(loc2, tmp_eve_buf, 128);
+ __lsx_vst(loc0, tmp_eve_buf, 144);
+}
+
+static void idct8x32_column_odd_process_store(int16_t *tmp_buf,
+ int16_t *tmp_odd_buf) { /* odd half of the column pass; writes 16 vectors to tmp_odd_buf */
+ __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+ __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+
+ /* Odd stage 1: rows 1, 7, 9, 15, 17, 23, 25, 31 assuming a 64-byte row pitch */
+ DUP4_ARG2(__lsx_vld, tmp_buf, 64, tmp_buf, 448, tmp_buf, 576, tmp_buf, 960,
+ reg0, reg1, reg2, reg3);
+ DUP4_ARG2(__lsx_vld, tmp_buf, 1088, tmp_buf, 1472, tmp_buf, 1600, tmp_buf,
+ 1984, reg4, reg5, reg6, reg7);
+
+ DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7);
+ DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4);
+ DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5);
+ DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6);
+
+ vec0 = __lsx_vadd_h(reg0, reg3); /* in-place add/sub chain: statement order matters */
+ reg0 = __lsx_vsub_h(reg0, reg3);
+ reg3 = __lsx_vadd_h(reg7, reg4);
+ reg7 = __lsx_vsub_h(reg7, reg4);
+ reg4 = __lsx_vadd_h(reg1, reg2);
+ reg1 = __lsx_vsub_h(reg1, reg2);
+ reg2 = __lsx_vadd_h(reg6, reg5);
+ reg6 = __lsx_vsub_h(reg6, reg5);
+ reg5 = vec0;
+
+ /* 4 Stores: tmp_odd_buf byte offsets 64, 80, 0, 16 */
+ DUP2_ARG2(__lsx_vadd_h, reg5, reg4, reg3, reg2, vec0, vec1);
+ __lsx_vst(vec0, tmp_odd_buf, 64);
+ __lsx_vst(vec1, tmp_odd_buf, 80);
+ DUP2_ARG2(__lsx_vsub_h, reg5, reg4, reg3, reg2, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1);
+ __lsx_vst(vec0, tmp_odd_buf, 0);
+ __lsx_vst(vec1, tmp_odd_buf, 16);
+
+ /* 4 Stores: tmp_odd_buf byte offsets 96, 112, 32, 48 */
+ DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7);
+ DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6);
+ LSX_BUTTERFLY_4_H(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3);
+ DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3);
+ __lsx_vst(vec0, tmp_odd_buf, 96);
+ __lsx_vst(vec1, tmp_odd_buf, 112);
+ __lsx_vst(vec2, tmp_odd_buf, 32);
+ __lsx_vst(vec3, tmp_odd_buf, 48);
+
+ /* Odd stage 2 */
+ /* 8 loads: rows 3, 5, 11, 13, 19, 21, 27, 29 assuming a 64-byte row pitch */
+ DUP4_ARG2(__lsx_vld, tmp_buf, 192, tmp_buf, 320, tmp_buf, 704, tmp_buf, 832,
+ reg0, reg1, reg2, reg3);
+ DUP4_ARG2(__lsx_vld, tmp_buf, 1216, tmp_buf, 1344, tmp_buf, 1728, tmp_buf,
+ 1856, reg4, reg5, reg6, reg7);
+ DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6);
+ DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5);
+ DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4);
+ DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7);
+
+ /* 4 Stores: tmp_odd_buf byte offsets 192, 240, 160, 176 */
+ DUP4_ARG2(__lsx_vsub_h, reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0,
+ vec1, vec2, vec3);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1);
+ DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3);
+ LSX_BUTTERFLY_4_H(loc2, loc3, loc1, loc0, vec0, vec1, vec3, vec2);
+ __lsx_vst(vec0, tmp_odd_buf, 192);
+ __lsx_vst(vec1, tmp_odd_buf, 240);
+ DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1);
+ __lsx_vst(vec0, tmp_odd_buf, 160);
+ __lsx_vst(vec1, tmp_odd_buf, 176);
+
+ /* 4 Stores: tmp_odd_buf byte offsets 208, 224, 128, 144 */
+ DUP4_ARG2(__lsx_vadd_h, reg0, reg3, reg1, reg2, reg5, reg6, reg4, reg7, vec0,
+ vec1, vec2, vec3);
+ LSX_BUTTERFLY_4_H(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2);
+ __lsx_vst(reg0, tmp_odd_buf, 208);
+ __lsx_vst(reg1, tmp_odd_buf, 224);
+ DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1);
+ __lsx_vst(reg0, tmp_odd_buf, 128);
+ __lsx_vst(reg1, tmp_odd_buf, 144);
+
+ /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */
+ /* Load 8 & Store 8: offsets 0..48 receive sums, 128..176 receive rotated differences */
+ DUP4_ARG2(__lsx_vld, tmp_odd_buf, 0, tmp_odd_buf, 16, tmp_odd_buf, 32,
+ tmp_odd_buf, 48, reg0, reg1, reg2, reg3);
+ DUP4_ARG2(__lsx_vld, tmp_odd_buf, 128, tmp_odd_buf, 144, tmp_odd_buf, 160,
+ tmp_odd_buf, 176, reg4, reg5, reg6, reg7);
+ DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0,
+ loc1, loc2, loc3);
+ __lsx_vst(loc0, tmp_odd_buf, 0);
+ __lsx_vst(loc1, tmp_odd_buf, 16);
+ __lsx_vst(loc2, tmp_odd_buf, 32);
+ __lsx_vst(loc3, tmp_odd_buf, 48);
+
+ DUP2_ARG2(__lsx_vsub_h, reg0, reg4, reg1, reg5, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
+ DUP2_ARG2(__lsx_vsub_h, reg2, reg6, reg3, reg7, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+ __lsx_vst(loc0, tmp_odd_buf, 128);
+ __lsx_vst(loc1, tmp_odd_buf, 144);
+ __lsx_vst(loc2, tmp_odd_buf, 160);
+ __lsx_vst(loc3, tmp_odd_buf, 176);
+
+ /* Load 8 & Store 8: offsets 64..112 receive sums, 192..240 receive rotated differences */
+ DUP4_ARG2(__lsx_vld, tmp_odd_buf, 64, tmp_odd_buf, 80, tmp_odd_buf, 96,
+ tmp_odd_buf, 112, reg1, reg2, reg0, reg3); /* note the destination order: reg1, reg2, reg0, reg3 */
+ DUP4_ARG2(__lsx_vld, tmp_odd_buf, 192, tmp_odd_buf, 208, tmp_odd_buf, 224,
+ tmp_odd_buf, 240, reg4, reg5, reg6, reg7);
+ DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0,
+ loc1, loc2, loc3);
+ __lsx_vst(loc0, tmp_odd_buf, 64);
+ __lsx_vst(loc1, tmp_odd_buf, 80);
+ __lsx_vst(loc2, tmp_odd_buf, 96);
+ __lsx_vst(loc3, tmp_odd_buf, 112);
+
+ DUP2_ARG2(__lsx_vsub_h, reg0, reg4, reg3, reg7, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
+ DUP2_ARG2(__lsx_vsub_h, reg1, reg5, reg2, reg6, vec0, vec1);
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+ __lsx_vst(loc0, tmp_odd_buf, 192);
+ __lsx_vst(loc1, tmp_odd_buf, 208);
+ __lsx_vst(loc2, tmp_odd_buf, 224);
+ __lsx_vst(loc3, tmp_odd_buf, 240);
+}
+
+static void idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf,
+ int16_t *tmp_odd_buf, uint8_t *dst,
+ int32_t dst_stride) { /* final even/odd butterfly, round by 6 bits, add into dst */
+ __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; /* vec*: from tmp_odd_buf, loc*: from tmp_eve_buf */
+ __m128i m0, m1, m2, m3, m4, m5, m6, m7;
+ __m128i n0, n1, n2, n3, n4, n5, n6, n7;
+ int32_t stride = dst_stride << 2; /* 4 dst rows */
+ int32_t stride2 = stride << 1; /* 8 dst rows */
+ int32_t stride3 = stride + stride2; /* 12 dst rows */
+
+ /* FINAL BUTTERFLY : Dependency on Even & Odd (buffer offsets are in bytes) */
+ DUP4_ARG2(__lsx_vld, tmp_odd_buf, 0, tmp_odd_buf, 144, tmp_odd_buf, 224,
+ tmp_odd_buf, 96, vec0, vec1, vec2, vec3);
+ DUP4_ARG2(__lsx_vld, tmp_eve_buf, 0, tmp_eve_buf, 128, tmp_eve_buf, 64,
+ tmp_eve_buf, 192, loc0, loc1, loc2, loc3);
+
+ DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0,
+ m4, m2, m6);
+ DUP4_ARG2(__lsx_vsrari_h, m0, 6, m2, 6, m4, 6, m6, 6, m0, m2, m4, m6); /* rounding arithmetic shift right by 6 */
+ VP9_ADDBLK_ST8x4_UB(dst, stride, stride2, stride3, m0, m2, m4, m6); /* presumably rows 0, 4, 8, 12 - macro not visible here */
+ DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m6,
+ m2, m4, m0); /* differences are assigned in reverse order (m6, m2, m4, m0) */
+ DUP4_ARG2(__lsx_vsrari_h, m0, 6, m2, 6, m4, 6, m6, 6, m0, m2, m4, m6);
+ VP9_ADDBLK_ST8x4_UB((dst + 19 * dst_stride), stride, stride2, stride3, m0, m2,
+ m4, m6);
+
+ /* Second group: base rows dst + 2 (sums) and dst + 17 (differences) */
+ DUP4_ARG2(__lsx_vld, tmp_odd_buf, 64, tmp_odd_buf, 208, tmp_odd_buf, 160,
+ tmp_odd_buf, 48, vec0, vec1, vec2, vec3);
+ DUP4_ARG2(__lsx_vld, tmp_eve_buf, 32, tmp_eve_buf, 160, tmp_eve_buf, 96,
+ tmp_eve_buf, 224, loc0, loc1, loc2, loc3);
+
+ DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1,
+ m5, m3, m7);
+ DUP4_ARG2(__lsx_vsrari_h, m1, 6, m3, 6, m5, 6, m7, 6, m1, m3, m5, m7);
+ VP9_ADDBLK_ST8x4_UB((dst + 2 * dst_stride), stride, stride2, stride3, m1, m3,
+ m5, m7);
+ DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m7,
+ m3, m5, m1);
+ DUP4_ARG2(__lsx_vsrari_h, m1, 6, m3, 6, m5, 6, m7, 6, m1, m3, m5, m7);
+ VP9_ADDBLK_ST8x4_UB((dst + 17 * dst_stride), stride, stride2, stride3, m1, m3,
+ m5, m7);
+
+ /* Third group: base rows dst + 1 (sums) and dst + 18 (differences) */
+ DUP4_ARG2(__lsx_vld, tmp_odd_buf, 32, tmp_odd_buf, 176, tmp_odd_buf, 192,
+ tmp_odd_buf, 112, vec0, vec1, vec2, vec3);
+ DUP4_ARG2(__lsx_vld, tmp_eve_buf, 16, tmp_eve_buf, 144, tmp_eve_buf, 80,
+ tmp_eve_buf, 208, loc0, loc1, loc2, loc3);
+ DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0,
+ n4, n2, n6);
+ DUP4_ARG2(__lsx_vsrari_h, n0, 6, n2, 6, n4, 6, n6, 6, n0, n2, n4, n6);
+ VP9_ADDBLK_ST8x4_UB((dst + dst_stride), stride, stride2, stride3, n0, n2, n4,
+ n6);
+ DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n6,
+ n2, n4, n0);
+ DUP4_ARG2(__lsx_vsrari_h, n0, 6, n2, 6, n4, 6, n6, 6, n0, n2, n4, n6);
+ VP9_ADDBLK_ST8x4_UB((dst + 18 * dst_stride), stride, stride2, stride3, n0, n2,
+ n4, n6);
+
+ /* Fourth group: base rows dst + 3 (sums) and dst + 16 (differences) */
+ DUP4_ARG2(__lsx_vld, tmp_odd_buf, 80, tmp_odd_buf, 240, tmp_odd_buf, 128,
+ tmp_odd_buf, 16, vec0, vec1, vec2, vec3);
+ DUP4_ARG2(__lsx_vld, tmp_eve_buf, 48, tmp_eve_buf, 176, tmp_eve_buf, 112,
+ tmp_eve_buf, 240, loc0, loc1, loc2, loc3);
+ DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1,
+ n5, n3, n7);
+ DUP4_ARG2(__lsx_vsrari_h, n1, 6, n3, 6, n5, 6, n7, 6, n1, n3, n5, n7);
+ VP9_ADDBLK_ST8x4_UB((dst + 3 * dst_stride), stride, stride2, stride3, n1, n3,
+ n5, n7);
+ DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n7,
+ n3, n5, n1);
+ DUP4_ARG2(__lsx_vsrari_h, n1, 6, n3, 6, n5, 6, n7, 6, n1, n3, n5, n7);
+ VP9_ADDBLK_ST8x4_UB((dst + 16 * dst_stride), stride, stride2, stride3, n1, n3,
+ n5, n7);
+}
+
+static void idct8x32_1d_columns_addblk_lsx(int16_t *input, uint8_t *dst,
+ int32_t dst_stride) { /* column pass for one 8-wide strip; the result is added into dst */
+ DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]); /* 16 vectors from the odd half */
+ DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]); /* 16 vectors from the even half */
+
+ idct8x32_column_even_process_store(input, &tmp_eve_buf[0]); /* input -> tmp_eve_buf */
+ idct8x32_column_odd_process_store(input, &tmp_odd_buf[0]); /* input -> tmp_odd_buf */
+ idct8x32_column_butterfly_addblk(&tmp_eve_buf[0], &tmp_odd_buf[0], dst,
+ dst_stride); /* combine both halves and accumulate into dst */
+}
+
+void vpx_idct32x32_1024_add_lsx(const int16_t *input, uint8_t *dst,
+ int32_t dst_stride) { /* 32x32 inverse DCT, all coefficients; result is added into dst */
+ int32_t i;
+ DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]); /* intermediate between the row and column passes */
+ int16_t *out_ptr = out_arr;
+
+ /* transform rows: four slabs of 8 rows each */
+ for (i = 0; i < 4; ++i) {
+ /* process 32 * 8 block; (i << 8) skips 256 int16, i.e. 8 rows of 32 */
+ idct32x8_1d_rows_lsx((input + (i << 8)), (out_ptr + (i << 8)));
+ }
+
+ for (i = 0; i < 4; ++i) { /* transform columns: four strips of 8 columns each */
+ /* process 8 * 32 block; (i << 3) steps 8 columns in both out_arr and dst */
+ idct8x32_1d_columns_addblk_lsx((out_ptr + (i << 3)), (dst + (i << 3)),
+ dst_stride);
+ }
+}
+
+void vpx_idct32x32_34_add_lsx(const int16_t *input, uint8_t *dst,
+ int32_t dst_stride) { /* 32x32 inverse DCT variant that runs the row pass on the first 8 rows only */
+ int32_t i;
+ DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]);
+ int16_t *out_ptr = out_arr;
+ __m128i zero = __lsx_vldi(0);
+
+ for (i = 32; i--;) { /* clear all of out_arr: 32 rows x 64 bytes */
+ __lsx_vst(zero, out_ptr, 0);
+ __lsx_vst(zero, out_ptr, 16);
+ __lsx_vst(zero, out_ptr, 32);
+ __lsx_vst(zero, out_ptr, 48);
+ out_ptr += 32; /* next row: 32 int16 */
+ }
+
+ out_ptr = out_arr; /* rewind after the clearing loop */
+
+ /* rows: only upper-left 8x8 has non-zero coeff, so a single 8-row slab is transformed */
+ idct32x8_1d_rows_lsx(input, out_ptr);
+
+ /* transform columns: all four 8-column strips */
+ for (i = 0; i < 4; ++i) {
+ /* process 8 * 32 block */
+ idct8x32_1d_columns_addblk_lsx((out_ptr + (i << 3)), (dst + (i << 3)),
+ dst_stride);
+ }
+}
+
+void vpx_idct32x32_1_add_lsx(const int16_t *input, uint8_t *dst,
+ int32_t dst_stride) { /* DC-only case: one value derived from input[0] is added to every pixel */
+ int32_t i;
+ int16_t out;
+ __m128i dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
+ __m128i res0, res1, res2, res3, res4, res5, res6, res7, vec;
+
+ out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS); /* first cospi_16_64 scaling */
+ out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS); /* second cospi_16_64 scaling */
+ out = ROUND_POWER_OF_TWO(out, 6); /* same 6-bit rounding as the full column pass */
+
+ vec = __lsx_vreplgr2vr_h(out); /* broadcast to all eight 16-bit lanes */
+
+ for (i = 16; i--;) { /* two rows of 32 pixels per iteration */
+ DUP2_ARG2(__lsx_vld, dst, 0, dst, 16, dst0, dst1);
+ dst2 = __lsx_vldx(dst, dst_stride); /* next row, bytes 0..15 */
+ dst3 = __lsx_vldx(dst + 16, dst_stride); /* next row, bytes 16..31 */
+
+ UNPCK_UB_SH(dst0, res0, res4); /* presumably widens u8 to 16-bit low/high halves - macro not visible here */
+ UNPCK_UB_SH(dst1, res1, res5);
+ UNPCK_UB_SH(dst2, res2, res6);
+ UNPCK_UB_SH(dst3, res3, res7);
+
+ DUP4_ARG2(__lsx_vadd_h, res0, vec, res1, vec, res2, vec, res3, vec, res0,
+ res1, res2, res3);
+ DUP4_ARG2(__lsx_vadd_h, res4, vec, res5, vec, res6, vec, res7, vec, res4,
+ res5, res6, res7);
+ DUP4_ARG3(__lsx_vssrarni_bu_h, res4, res0, 0, res5, res1, 0, res6, res2, 0,
+ res7, res3, 0, tmp0, tmp1, tmp2, tmp3); /* shift amount 0: saturate to u8 and repack both halves */
+ __lsx_vst(tmp0, dst, 0);
+ __lsx_vst(tmp1, dst, 16);
+ dst += dst_stride;
+ __lsx_vst(tmp2, dst, 0);
+ __lsx_vst(tmp3, dst, 16);
+ dst += dst_stride;
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/intrapred_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/intrapred_lsx.c
new file mode 100644
index 0000000000..f990211791
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/intrapred_lsx.c
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_util/loongson_intrinsics.h"
+
+static inline void intra_predict_dc_8x8_lsx(const uint8_t *src_top,
+ const uint8_t *src_left,
+ uint8_t *dst, int32_t dst_stride) { /* fills an 8x8 block with the rounded mean of 8 top + 8 left bytes */
+ uint64_t val0, val1;
+ int32_t dst_stride_x2 = dst_stride << 1;
+ int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+ int32_t dst_stride_x4 = dst_stride << 2;
+ __m128i store, sum_h, sum_w, sum_d;
+ __m128i src = { 0 };
+
+ val0 = *(const uint64_t *)src_top; /* NOTE(review): type-punned 8-byte load; assumes unaligned access is acceptable here - confirm */
+ val1 = *(const uint64_t *)src_left; /* NOTE(review): same assumption as the line above */
+ DUP2_ARG3(__lsx_vinsgr2vr_d, src, val0, 0, src, val1, 1, src, src); /* top bytes in lane 0, left bytes in lane 1 */
+ sum_h = __lsx_vhaddw_hu_bu(src, src); /* widening horizontal adds: u8 -> u16 */
+ sum_w = __lsx_vhaddw_wu_hu(sum_h, sum_h); /* u16 -> u32 */
+ sum_d = __lsx_vhaddw_du_wu(sum_w, sum_w); /* u32 -> u64 */
+ sum_w = __lsx_vpickev_w(sum_d, sum_d); /* gather the even 32-bit words so the two partial sums are adjacent */
+ sum_d = __lsx_vhaddw_du_wu(sum_w, sum_w); /* total of all 16 input bytes */
+ sum_w = __lsx_vsrari_w(sum_d, 4); /* rounding shift by 4: divide by 16 samples */
+ store = __lsx_vreplvei_b(sum_w, 0); /* broadcast byte 0 (the DC value) to every lane */
+
+ __lsx_vstelm_d(store, dst, 0, 0); /* each vstelm_d writes one 8-byte row */
+ __lsx_vstelm_d(store, dst + dst_stride, 0, 0);
+ __lsx_vstelm_d(store, dst + dst_stride_x2, 0, 0);
+ __lsx_vstelm_d(store, dst + dst_stride_x3, 0, 0);
+ dst += dst_stride_x4; /* rows 4..7 */
+ __lsx_vstelm_d(store, dst, 0, 0);
+ __lsx_vstelm_d(store, dst + dst_stride, 0, 0);
+ __lsx_vstelm_d(store, dst + dst_stride_x2, 0, 0);
+ __lsx_vstelm_d(store, dst + dst_stride_x3, 0, 0);
+}
+
+static inline void intra_predict_dc_16x16_lsx(const uint8_t *src_top,
+ const uint8_t *src_left,
+ uint8_t *dst,
+ int32_t dst_stride) { /* fills a 16x16 block with the rounded mean of 16 top + 16 left bytes */
+ int32_t dst_stride_x2 = dst_stride << 1;
+ int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+ int32_t dst_stride_x4 = dst_stride << 2;
+ __m128i top, left, out;
+ __m128i sum_h, sum_top, sum_left;
+ __m128i sum_w;
+ __m128i sum_d;
+
+ DUP2_ARG2(__lsx_vld, src_top, 0, src_left, 0, top, left); /* 16 bytes from each edge */
+ DUP2_ARG2(__lsx_vhaddw_hu_bu, top, top, left, left, sum_top, sum_left); /* widening horizontal adds: u8 -> u16 */
+ sum_h = __lsx_vadd_h(sum_top, sum_left); /* merge the top and left partial sums */
+ sum_w = __lsx_vhaddw_wu_hu(sum_h, sum_h); /* u16 -> u32 */
+ sum_d = __lsx_vhaddw_du_wu(sum_w, sum_w); /* u32 -> u64 */
+ sum_w = __lsx_vpickev_w(sum_d, sum_d); /* gather the even 32-bit words so the two partial sums are adjacent */
+ sum_d = __lsx_vhaddw_du_wu(sum_w, sum_w); /* total of all 32 input bytes */
+ sum_w = __lsx_vsrari_w(sum_d, 5); /* rounding shift by 5: divide by 32 samples */
+ out = __lsx_vreplvei_b(sum_w, 0); /* broadcast byte 0 (the DC value) to every lane */
+
+ __lsx_vstx(out, dst, 0); /* each vstx writes one 16-byte row; rows 0..3 */
+ __lsx_vstx(out, dst, dst_stride);
+ __lsx_vstx(out, dst, dst_stride_x2);
+ __lsx_vstx(out, dst, dst_stride_x3);
+ dst += dst_stride_x4; /* rows 4..7 */
+ __lsx_vstx(out, dst, 0);
+ __lsx_vstx(out, dst, dst_stride);
+ __lsx_vstx(out, dst, dst_stride_x2);
+ __lsx_vstx(out, dst, dst_stride_x3);
+ dst += dst_stride_x4; /* rows 8..11 */
+ __lsx_vstx(out, dst, 0);
+ __lsx_vstx(out, dst, dst_stride);
+ __lsx_vstx(out, dst, dst_stride_x2);
+ __lsx_vstx(out, dst, dst_stride_x3);
+ dst += dst_stride_x4; /* rows 12..15 */
+ __lsx_vstx(out, dst, 0);
+ __lsx_vstx(out, dst, dst_stride);
+ __lsx_vstx(out, dst, dst_stride_x2);
+ __lsx_vstx(out, dst, dst_stride_x3);
+}
+
+void vpx_dc_predictor_8x8_lsx(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) { /* public entry point for the 8x8 DC predictor */
+ intra_predict_dc_8x8_lsx(above, left, dst, y_stride); /* argument order differs; y_stride is narrowed to int32_t */
+}
+
+void vpx_dc_predictor_16x16_lsx(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) { /* public entry point for the 16x16 DC predictor */
+ intra_predict_dc_16x16_lsx(above, left, dst, y_stride); /* argument order differs; y_stride is narrowed to int32_t */
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_16_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_16_lsx.c
new file mode 100644
index 0000000000..0503df9966
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_16_lsx.c
@@ -0,0 +1,1320 @@
+/*
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ * Contributed by Hecai Yuan <yuanhecai@loongson.cn>
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/loopfilter_lsx.h"
+#include "vpx_ports/mem.h"
+
+#define LSX_LD_8(_src, _stride, _stride2, _stride3, _stride4, _in0, _in1, \
+ _in2, _in3, _in4, _in5, _in6, _in7) \
+ do { /* loads 8 consecutive rows starting at _src into _in0.._in7 */ \
+ _in0 = __lsx_vld(_src, 0); \
+ _in1 = __lsx_vldx(_src, _stride); \
+ _in2 = __lsx_vldx(_src, _stride2); \
+ _in3 = __lsx_vldx(_src, _stride3); \
+ _src += _stride4; /* side effect: the caller's pointer is advanced by _stride4 */ \
+ _in4 = __lsx_vld(_src, 0); \
+ _in5 = __lsx_vldx(_src, _stride); \
+ _in6 = __lsx_vldx(_src, _stride2); \
+ _in7 = __lsx_vldx(_src, _stride3); \
+ } while (0)
+
+#define LSX_ST_8(_dst0, _dst1, _dst2, _dst3, _dst4, _dst5, _dst6, _dst7, _dst, \
+ _stride, _stride2, _stride3, _stride4) \
+ do { /* stores _dst0.._dst7 to 8 consecutive rows starting at _dst */ \
+ __lsx_vst(_dst0, _dst, 0); \
+ __lsx_vstx(_dst1, _dst, _stride); \
+ __lsx_vstx(_dst2, _dst, _stride2); \
+ __lsx_vstx(_dst3, _dst, _stride3); \
+ _dst += _stride4; /* side effect: the caller's pointer is advanced by _stride4 */ \
+ __lsx_vst(_dst4, _dst, 0); \
+ __lsx_vstx(_dst5, _dst, _stride); \
+ __lsx_vstx(_dst6, _dst, _stride2); \
+ __lsx_vstx(_dst7, _dst, _stride3); \
+ } while (0)
+
+static int32_t hz_lpf_t4_and_t8_16w(uint8_t *dst, int32_t stride,
+ uint8_t *filter48,
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr) { /* returns 1 if the result was written to dst, 0 if it was staged in filter48 */
+ __m128i p3, p2, p1, p0, q3, q2, q1, q0; /* 4 rows above dst (p3..p0) and 4 rows from dst down (q0..q3) */
+ __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+ __m128i flat, mask, hev, thresh, b_limit, limit;
+ __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; /* low 8 bytes widened to 16 bit */
+ __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h; /* high 8 bytes widened to 16 bit */
+ __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
+ __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
+ __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h;
+ __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h;
+
+ int32_t stride2 = stride << 1;
+ int32_t stride3 = stride2 + stride;
+ int32_t stride4 = stride2 << 1;
+
+ /* load vector elements: one 16-byte row per vector */
+ DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2, dst,
+ -stride, p3, p2, p1, p0);
+
+ q0 = __lsx_vld(dst, 0);
+ DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
+ q3 = __lsx_vldx(dst, stride3);
+
+ thresh = __lsx_vldrepl_b(thresh_ptr, 0); /* broadcast the single threshold byte to all lanes */
+ b_limit = __lsx_vldrepl_b(b_limit_ptr, 0);
+ limit = __lsx_vldrepl_b(limit_ptr, 0);
+ /* mask and hev (macros from loopfilter_lsx.h, not visible here) */
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+ VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+ if (__lsx_bz_v(flat)) { /* flat is all zero: only the filter4 outputs are needed */
+ __lsx_vstx(p1_out, dst, -stride2);
+ __lsx_vstx(p0_out, dst, -stride);
+ __lsx_vst(q0_out, dst, 0);
+ __lsx_vstx(q1_out, dst, stride);
+
+ return 1; /* rows already written to dst */
+ }
+
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, p3, 0, p2, 0, p1, 0, p0, 0, p3_l, p2_l, p1_l,
+ p0_l); /* shift amount 0: plain zero-extension of the low 8 bytes */
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, q0, 0, q1, 0, q2, 0, q3, 0, q0_l, q1_l, q2_l,
+ q3_l);
+
+ VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+ p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+
+ DUP4_ARG1(__lsx_vexth_hu_bu, p3, p2, p1, p0, p3_h, p2_h, p1_h, p0_h); /* zero-extend the high 8 bytes */
+ DUP4_ARG1(__lsx_vexth_hu_bu, q0, q1, q2, q3, q0_h, q1_h, q2_h, q3_h);
+ VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h,
+ p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h);
+
+ /* convert 16 bit output data into 8 bit; the *_filt8_l names are reused for the packed result */
+ DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h, p1_filt8_l,
+ p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l, p2_filt8_l,
+ p1_filt8_l, p0_filt8_l, q0_filt8_l);
+ DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h, q2_filt8_l,
+ q1_filt8_l, q2_filt8_l);
+
+ /* select pixel values: filter8 result where flat bits are set, otherwise filter4 result or the input */
+ DUP4_ARG3(__lsx_vbitsel_v, p2, p2_filt8_l, flat, p1_out, p1_filt8_l, flat,
+ p0_out, p0_filt8_l, flat, q0_out, q0_filt8_l, flat, p2_out, p1_out,
+ p0_out, q0_out);
+ DUP2_ARG3(__lsx_vbitsel_v, q1_out, q1_filt8_l, flat, q2, q2_filt8_l, flat,
+ q1_out, q2_out);
+
+ __lsx_vst(p2_out, filter48, 0); /* filter48 layout: six rows at byte offsets 0..80, then flat at 96 (112 bytes) */
+ __lsx_vst(p1_out, filter48, 16);
+ __lsx_vst(p0_out, filter48, 32);
+ __lsx_vst(q0_out, filter48, 48);
+ __lsx_vst(q1_out, filter48, 64);
+ __lsx_vst(q2_out, filter48, 80);
+ __lsx_vst(flat, filter48, 96);
+
+ return 0; /* dst is untouched on this path; results are only in filter48 */
+}
+
+static void hz_lpf_t16_16w(uint8_t *dst, int32_t stride, uint8_t *filter48) {
+ int32_t stride2 = stride << 1;
+ int32_t stride3 = stride2 + stride;
+ int32_t stride4 = stride2 << 1;
+ uint8_t *dst_tmp0 = dst - stride4;
+ uint8_t *dst_tmp1 = dst + stride4;
+
+ __m128i flat, flat2, filter8;
+ __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+ __m128i out_h, out_l;
+ v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in;
+ v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in;
+ v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in;
+ v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in;
+ v8u16 p7_h_in, p6_h_in, p5_h_in, p4_h_in;
+ v8u16 p3_h_in, p2_h_in, p1_h_in, p0_h_in;
+ v8u16 q7_h_in, q6_h_in, q5_h_in, q4_h_in;
+ v8u16 q3_h_in, q2_h_in, q1_h_in, q0_h_in;
+ v8u16 tmp0_l, tmp1_l, tmp0_h, tmp1_h;
+
+ flat = __lsx_vld(filter48, 96);
+
+ DUP4_ARG2(__lsx_vldx, dst_tmp0, -stride4, dst_tmp0, -stride3, dst_tmp0,
+ -stride2, dst_tmp0, -stride, p7, p6, p5, p4);
+
+ p3 = __lsx_vld(dst_tmp0, 0);
+ DUP2_ARG2(__lsx_vldx, dst_tmp0, stride, dst_tmp0, stride2, p2, p1);
+ p0 = __lsx_vldx(dst_tmp0, stride3);
+
+ q0 = __lsx_vld(dst, 0);
+ DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
+ q3 = __lsx_vldx(dst, stride3);
+
+ q4 = __lsx_vld(dst_tmp1, 0);
+ DUP2_ARG2(__lsx_vldx, dst_tmp1, stride, dst_tmp1, stride2, q5, q6);
+ q7 = __lsx_vldx(dst_tmp1, stride3);
+
+ VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
+
+ if (__lsx_bz_v(flat2)) {
+ DUP4_ARG2(__lsx_vld, filter48, 0, filter48, 16, filter48, 32, filter48, 48,
+ p2, p1, p0, q0);
+ DUP2_ARG2(__lsx_vld, filter48, 64, filter48, 80, q1, q2);
+ __lsx_vstx(p2, dst, -stride3);
+ __lsx_vstx(p1, dst, -stride2);
+ __lsx_vstx(p0, dst, -stride);
+ __lsx_vst(q0, dst, 0);
+ __lsx_vstx(q1, dst, stride);
+ __lsx_vstx(q2, dst, stride2);
+ } else {
+ dst = dst_tmp0 - stride3;
+
+ p7_l_in = (v8u16)__lsx_vsllwil_hu_bu(p7, 0);
+ p6_l_in = (v8u16)__lsx_vsllwil_hu_bu(p6, 0);
+ p5_l_in = (v8u16)__lsx_vsllwil_hu_bu(p5, 0);
+ p4_l_in = (v8u16)__lsx_vsllwil_hu_bu(p4, 0);
+ p3_l_in = (v8u16)__lsx_vsllwil_hu_bu(p3, 0);
+ p2_l_in = (v8u16)__lsx_vsllwil_hu_bu(p2, 0);
+ p1_l_in = (v8u16)__lsx_vsllwil_hu_bu(p1, 0);
+ p0_l_in = (v8u16)__lsx_vsllwil_hu_bu(p0, 0);
+ q0_l_in = (v8u16)__lsx_vsllwil_hu_bu(q0, 0);
+
+ tmp0_l = p7_l_in << 3;
+ tmp0_l -= p7_l_in;
+ tmp0_l += p6_l_in;
+ tmp0_l += q0_l_in;
+ tmp1_l = p6_l_in + p5_l_in;
+ tmp1_l += p4_l_in;
+ tmp1_l += p3_l_in;
+ tmp1_l += p2_l_in;
+ tmp1_l += p1_l_in;
+ tmp1_l += p0_l_in;
+ tmp1_l += tmp0_l;
+
+ out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+ p7_h_in = (v8u16)__lsx_vexth_hu_bu(p7);
+ p6_h_in = (v8u16)__lsx_vexth_hu_bu(p6);
+ p5_h_in = (v8u16)__lsx_vexth_hu_bu(p5);
+ p4_h_in = (v8u16)__lsx_vexth_hu_bu(p4);
+ p3_h_in = (v8u16)__lsx_vexth_hu_bu(p3);
+ p2_h_in = (v8u16)__lsx_vexth_hu_bu(p2);
+ p1_h_in = (v8u16)__lsx_vexth_hu_bu(p1);
+ p0_h_in = (v8u16)__lsx_vexth_hu_bu(p0);
+ q0_h_in = (v8u16)__lsx_vexth_hu_bu(q0);
+
+ tmp0_h = p7_h_in << 3;
+ tmp0_h -= p7_h_in;
+ tmp0_h += p6_h_in;
+ tmp0_h += q0_h_in;
+ tmp1_h = p6_h_in + p5_h_in;
+ tmp1_h += p4_h_in;
+ tmp1_h += p3_h_in;
+ tmp1_h += p2_h_in;
+ tmp1_h += p1_h_in;
+ tmp1_h += p0_h_in;
+ tmp1_h += tmp0_h;
+
+ out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+ out_l = __lsx_vpickev_b(out_h, out_l);
+ p6 = __lsx_vbitsel_v(p6, out_l, flat2);
+ __lsx_vst(p6, dst, 0);
+ dst += stride;
+
+ /* p5 */
+ q1_l_in = (v8u16)__lsx_vsllwil_hu_bu(q1, 0);
+ tmp0_l = p5_l_in - p6_l_in;
+ tmp0_l += q1_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+ q1_h_in = (v8u16)__lsx_vexth_hu_bu(q1);
+ tmp0_h = p5_h_in - p6_h_in;
+ tmp0_h += q1_h_in;
+ tmp0_h -= p7_h_in;
+ tmp1_h += tmp0_h;
+ out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+ out_l = __lsx_vpickev_b(out_h, out_l);
+ p5 = __lsx_vbitsel_v(p5, out_l, flat2);
+ __lsx_vst(p5, dst, 0);
+ dst += stride;
+
+ /* p4 */
+ q2_l_in = (v8u16)__lsx_vsllwil_hu_bu(q2, 0);
+ tmp0_l = p4_l_in - p5_l_in;
+ tmp0_l += q2_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+ q2_h_in = (v8u16)__lsx_vexth_hu_bu(q2);
+ tmp0_h = p4_h_in - p5_h_in;
+ tmp0_h += q2_h_in;
+ tmp0_h -= p7_h_in;
+ tmp1_h += tmp0_h;
+ out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+ out_l = __lsx_vpickev_b(out_h, out_l);
+ p4 = __lsx_vbitsel_v(p4, out_l, flat2);
+ __lsx_vst(p4, dst, 0);
+ dst += stride;
+
+ /* p3 */
+ q3_l_in = (v8u16)__lsx_vsllwil_hu_bu(q3, 0);
+ tmp0_l = p3_l_in - p4_l_in;
+ tmp0_l += q3_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+ q3_h_in = (v8u16)__lsx_vexth_hu_bu(q3);
+ tmp0_h = p3_h_in - p4_h_in;
+ tmp0_h += q3_h_in;
+ tmp0_h -= p7_h_in;
+ tmp1_h += tmp0_h;
+ out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+ out_l = __lsx_vpickev_b(out_h, out_l);
+ p3 = __lsx_vbitsel_v(p3, out_l, flat2);
+ __lsx_vst(p3, dst, 0);
+ dst += stride;
+
+ /* p2 */
+ q4_l_in = (v8u16)__lsx_vsllwil_hu_bu(q4, 0);
+ filter8 = __lsx_vld(filter48, 0);
+ tmp0_l = p2_l_in - p3_l_in;
+ tmp0_l += q4_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+ q4_h_in = (v8u16)__lsx_vexth_hu_bu(q4);
+ tmp0_h = p2_h_in - p3_h_in;
+ tmp0_h += q4_h_in;
+ tmp0_h -= p7_h_in;
+ tmp1_h += tmp0_h;
+ out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+ out_l = __lsx_vpickev_b(out_h, out_l);
+ filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+ __lsx_vst(filter8, dst, 0);
+ dst += stride;
+
+ /* p1 */
+ q5_l_in = (v8u16)__lsx_vsllwil_hu_bu(q5, 0);
+ filter8 = __lsx_vld(filter48, 16);
+ tmp0_l = p1_l_in - p2_l_in;
+ tmp0_l += q5_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+ q5_h_in = (v8u16)__lsx_vexth_hu_bu(q5);
+ tmp0_h = p1_h_in - p2_h_in;
+ tmp0_h += q5_h_in;
+ tmp0_h -= p7_h_in;
+ tmp1_h += tmp0_h;
+ out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+ out_l = __lsx_vpickev_b(out_h, out_l);
+ filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+ __lsx_vst(filter8, dst, 0);
+ dst += stride;
+
+ /* p0 */
+ q6_l_in = (v8u16)__lsx_vsllwil_hu_bu(q6, 0);
+ filter8 = __lsx_vld(filter48, 32);
+ tmp0_l = p0_l_in - p1_l_in;
+ tmp0_l += q6_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+ q6_h_in = (v8u16)__lsx_vexth_hu_bu(q6);
+ tmp0_h = p0_h_in - p1_h_in;
+ tmp0_h += q6_h_in;
+ tmp0_h -= p7_h_in;
+ tmp1_h += tmp0_h;
+ out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+ out_l = __lsx_vpickev_b(out_h, out_l);
+ filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+ __lsx_vst(filter8, dst, 0);
+ dst += stride;
+
+ /* q0 */
+ q7_l_in = (v8u16)__lsx_vsllwil_hu_bu(q7, 0);
+ filter8 = __lsx_vld(filter48, 48);
+ tmp0_l = q7_l_in - p0_l_in;
+ tmp0_l += q0_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+ q7_h_in = (v8u16)__lsx_vexth_hu_bu(q7);
+ tmp0_h = q7_h_in - p0_h_in;
+ tmp0_h += q0_h_in;
+ tmp0_h -= p7_h_in;
+ tmp1_h += tmp0_h;
+ out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+ out_l = __lsx_vpickev_b(out_h, out_l);
+ filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+ __lsx_vst(filter8, dst, 0);
+ dst += stride;
+
+ /* q1 */
+ filter8 = __lsx_vld(filter48, 64);
+ tmp0_l = q7_l_in - q0_l_in;
+ tmp0_l += q1_l_in;
+ tmp0_l -= p6_l_in;
+ tmp1_l += tmp0_l;
+ out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+ tmp0_h = q7_h_in - q0_h_in;
+ tmp0_h += q1_h_in;
+ tmp0_h -= p6_h_in;
+ tmp1_h += tmp0_h;
+ out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+ out_l = __lsx_vpickev_b(out_h, out_l);
+ filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+ __lsx_vst(filter8, dst, 0);
+ dst += stride;
+
+ /* q2 */
+ filter8 = __lsx_vld(filter48, 80);
+ tmp0_l = q7_l_in - q1_l_in;
+ tmp0_l += q2_l_in;
+ tmp0_l -= p5_l_in;
+ tmp1_l += tmp0_l;
+ out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+ tmp0_h = q7_h_in - q1_h_in;
+ tmp0_h += q2_h_in;
+ tmp0_h -= p5_h_in;
+ tmp1_h += tmp0_h;
+ out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+ out_l = __lsx_vpickev_b(out_h, out_l);
+ filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+ __lsx_vst(filter8, dst, 0);
+ dst += stride;
+
+ /* q3 */
+ tmp0_l = q7_l_in - q2_l_in;
+ tmp0_l += q3_l_in;
+ tmp0_l -= p4_l_in;
+ tmp1_l += tmp0_l;
+ out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+ tmp0_h = q7_h_in - q2_h_in;
+ tmp0_h += q3_h_in;
+ tmp0_h -= p4_h_in;
+ tmp1_h += tmp0_h;
+ out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+ out_l = __lsx_vpickev_b(out_h, out_l);
+ q3 = __lsx_vbitsel_v(q3, out_l, flat2);
+ __lsx_vst(q3, dst, 0);
+ dst += stride;
+
+ /* q4 */
+ tmp0_l = q7_l_in - q3_l_in;
+ tmp0_l += q4_l_in;
+ tmp0_l -= p3_l_in;
+ tmp1_l += tmp0_l;
+ out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+ tmp0_h = q7_h_in - q3_h_in;
+ tmp0_h += q4_h_in;
+ tmp0_h -= p3_h_in;
+ tmp1_h += tmp0_h;
+ out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+ out_l = __lsx_vpickev_b(out_h, out_l);
+ q4 = __lsx_vbitsel_v(q4, out_l, flat2);
+ __lsx_vst(q4, dst, 0);
+ dst += stride;
+
+ /* q5 */
+ tmp0_l = q7_l_in - q4_l_in;
+ tmp0_l += q5_l_in;
+ tmp0_l -= p2_l_in;
+ tmp1_l += tmp0_l;
+ out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+ tmp0_h = q7_h_in - q4_h_in;
+ tmp0_h += q5_h_in;
+ tmp0_h -= p2_h_in;
+ tmp1_h += tmp0_h;
+ out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+ out_l = __lsx_vpickev_b(out_h, out_l);
+ q5 = __lsx_vbitsel_v(q5, out_l, flat2);
+ __lsx_vst(q5, dst, 0);
+ dst += stride;
+
+ /* q6 */
+ tmp0_l = q7_l_in - q5_l_in;
+ tmp0_l += q6_l_in;
+ tmp0_l -= p1_l_in;
+ tmp1_l += tmp0_l;
+ out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+ tmp0_h = q7_h_in - q5_h_in;
+ tmp0_h += q6_h_in;
+ tmp0_h -= p1_h_in;
+ tmp1_h += tmp0_h;
+ out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+ out_l = __lsx_vpickev_b(out_h, out_l);
+ q6 = __lsx_vbitsel_v(q6, out_l, flat2);
+ __lsx_vst(q6, dst, 0);
+ }
+}
+
+/* Filters a horizontal edge in two stages.
+ *
+ * hz_lpf_t4_and_t8_16w() runs first and leaves its intermediate output in a
+ * 16-byte aligned scratch buffer.  When it returns zero, hz_lpf_t16_16w() is
+ * run with the same scratch buffer; any non-zero return ends the work here.
+ *
+ * dst/stride address the picture rows; b_limit_ptr, limit_ptr and thresh_ptr
+ * are forwarded unchanged to the first stage.
+ */
+static void mb_lpf_horizontal_edge_dual(uint8_t *dst, int32_t stride,
+                                        const uint8_t *b_limit_ptr,
+                                        const uint8_t *limit_ptr,
+                                        const uint8_t *thresh_ptr) {
+  DECLARE_ALIGNED(16, uint8_t, filter48[16 * 8]);
+  /* Kept as uint8_t so the stage result is narrowed exactly as before. */
+  const uint8_t first_stage_done = hz_lpf_t4_and_t8_16w(
+      dst, stride, filter48, b_limit_ptr, limit_ptr, thresh_ptr);
+
+  if (first_stage_done != 0) {
+    return;
+  }
+
+  hz_lpf_t16_16w(dst, stride, filter48);
+}
+
+/* Loop-filters a horizontal edge with the filter4 / filter8 / filter16
+ * cascade.
+ *
+ * dst         - first byte of the row just below the edge (q0).  Rows p0..p7
+ *               lie at dst - 1*stride .. dst - 8*stride and rows q1..q7 at
+ *               dst + 1*stride .. dst + 7*stride.
+ * stride      - distance in bytes between consecutive rows.
+ * b_limit_ptr, limit_ptr, thresh_ptr
+ *             - each points at one byte that is replicated into every lane.
+ * count       - 1 filters 8 pixels along the edge right here (only the low
+ *               64 bits of each row are stored); any other value forwards to
+ *               mb_lpf_horizontal_edge_dual().
+ */
+static void mb_lpf_horizontal_edge(uint8_t *dst, int32_t stride,
+                                   const uint8_t *b_limit_ptr,
+                                   const uint8_t *limit_ptr,
+                                   const uint8_t *thresh_ptr, int32_t count) {
+  if (count == 1) {
+    __m128i flat2, mask, hev, flat, thresh, b_limit, limit;
+    __m128i p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7;
+    __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+    __m128i p0_filter16, p1_filter16;
+    __m128i p2_filter8, p1_filter8, p0_filter8;
+    __m128i q0_filter8, q1_filter8, q2_filter8;
+    __m128i p7_l, p6_l, p5_l, p4_l, q7_l, q6_l, q5_l, q4_l;
+    __m128i p3_l, p2_l, p1_l, p0_l, q3_l, q2_l, q1_l, q0_l;
+    __m128i zero = __lsx_vldi(0);
+    __m128i tmp0, tmp1, tmp2;
+
+    int32_t stride2 = stride << 1;
+    /* Three rows.  This has to be stride2 + stride (it was 2 + stride): the
+     * value is the row offset of p2/q3 and p6/q7 and is also used to step dst
+     * back to the p2 row and to the p6 row before storing. */
+    int32_t stride3 = stride2 + stride;
+    int32_t stride4 = stride << 2;
+    uint8_t *dst_tmp0 = dst - stride4; /* four rows above dst */
+    uint8_t *dst_tmp1 = dst + stride4; /* q4 row */
+
+    /* load vector elements */
+    DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2, dst,
+              -stride, p3, p2, p1, p0);
+    q0 = __lsx_vld(dst, 0);
+    DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
+    q3 = __lsx_vldx(dst, stride3);
+
+    thresh = __lsx_vldrepl_b(thresh_ptr, 0);
+    b_limit = __lsx_vldrepl_b(b_limit_ptr, 0);
+    limit = __lsx_vldrepl_b(limit_ptr, 0);
+
+    /* filter_mask* */
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+                 mask, flat);
+    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
+                       q1_out);
+    /* Only the low 8 lanes are filtered here; clear the upper half of flat. */
+    flat = __lsx_vilvl_d(zero, flat);
+    if (__lsx_bz_v(flat)) {
+      /* flat is zero everywhere: the filter4 result is final. */
+      __lsx_vstelm_d(p1_out, dst - stride2, 0, 0);
+      __lsx_vstelm_d(p0_out, dst - stride, 0, 0);
+      __lsx_vstelm_d(q0_out, dst, 0, 0);
+      __lsx_vstelm_d(q1_out, dst + stride, 0, 0);
+    } else {
+      /* convert 8 bit input data into 16 bit */
+      DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_l,
+                p2_l, p1_l, p0_l);
+      DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_l,
+                q1_l, q2_l, q3_l);
+      VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filter8,
+                  p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);
+
+      /* convert 16 bit output data into 8 bit */
+      DUP4_ARG2(__lsx_vpickev_b, zero, p2_filter8, zero, p1_filter8, zero,
+                p0_filter8, zero, q0_filter8, p2_filter8, p1_filter8,
+                p0_filter8, q0_filter8);
+      DUP2_ARG2(__lsx_vpickev_b, zero, q1_filter8, zero, q2_filter8, q1_filter8,
+                q2_filter8);
+
+      /* per lane: filter8 result where flat is set, otherwise the filter4
+       * result (or the unfiltered p2 / q2) */
+      p2_out = __lsx_vbitsel_v(p2, p2_filter8, flat);
+      p1_out = __lsx_vbitsel_v(p1_out, p1_filter8, flat);
+      p0_out = __lsx_vbitsel_v(p0_out, p0_filter8, flat);
+      q0_out = __lsx_vbitsel_v(q0_out, q0_filter8, flat);
+      q1_out = __lsx_vbitsel_v(q1_out, q1_filter8, flat);
+      q2_out = __lsx_vbitsel_v(q2, q2_filter8, flat);
+
+      /* load the outer rows p7..p4 and q4..q7 */
+      DUP4_ARG2(__lsx_vldx, dst_tmp0, -stride4, dst_tmp0, -stride3, dst_tmp0,
+                -stride2, dst_tmp0, -stride, p7, p6, p5, p4);
+      q4 = __lsx_vld(dst_tmp1, 0);
+      DUP2_ARG2(__lsx_vldx, dst_tmp1, stride, dst_tmp1, stride2, q5, q6);
+      q7 = __lsx_vldx(dst_tmp1, stride3);
+
+      VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
+
+      if (__lsx_bz_v(flat2)) {
+        /* flat2 is zero everywhere: store the six filter8-stage rows,
+         * starting at the p2 row. */
+        dst -= stride3;
+        __lsx_vstelm_d(p2_out, dst, 0, 0);
+        dst += stride;
+        __lsx_vstelm_d(p1_out, dst, 0, 0);
+        dst += stride;
+        __lsx_vstelm_d(p0_out, dst, 0, 0);
+        dst += stride;
+        __lsx_vstelm_d(q0_out, dst, 0, 0);
+        dst += stride;
+        __lsx_vstelm_d(q1_out, dst, 0, 0);
+        dst += stride;
+        __lsx_vstelm_d(q2_out, dst, 0, 0);
+      } else {
+        /* LSB(right) 8 pixel operation */
+        DUP4_ARG2(__lsx_vilvl_b, zero, p7, zero, p6, zero, p5, zero, p4, p7_l,
+                  p6_l, p5_l, p4_l);
+        DUP4_ARG2(__lsx_vilvl_b, zero, q4, zero, q5, zero, q6, zero, q7, q4_l,
+                  q5_l, q6_l, q7_l);
+
+        /* tmp0 = 7 * p7 + p6 + q0 */
+        tmp0 = __lsx_vslli_h(p7_l, 3);
+        tmp0 = __lsx_vsub_h(tmp0, p7_l);
+        tmp0 = __lsx_vadd_h(tmp0, p6_l);
+        tmp0 = __lsx_vadd_h(tmp0, q0_l);
+
+        /* p6 row: seven rows above q0 */
+        dst = dst_tmp0 - stride3;
+
+        /* calculation of p6 and p5 */
+        /* tmp1 holds the running 16-weight sum; each later output adjusts it
+         * by adding two taps and dropping two. */
+        tmp1 = __lsx_vadd_h(p6_l, p5_l);
+        tmp1 = __lsx_vadd_h(tmp1, p4_l);
+        tmp1 = __lsx_vadd_h(tmp1, p3_l);
+        tmp1 = __lsx_vadd_h(tmp1, p2_l);
+        tmp1 = __lsx_vadd_h(tmp1, p1_l);
+        tmp1 = __lsx_vadd_h(tmp1, p0_l);
+        tmp1 = __lsx_vadd_h(tmp1, tmp0);
+        p0_filter16 = __lsx_vsrari_h(tmp1, 4); /* (sum + 8) >> 4 */
+        tmp0 = __lsx_vsub_h(p5_l, p6_l);
+        tmp0 = __lsx_vadd_h(tmp0, q1_l);
+        tmp0 = __lsx_vsub_h(tmp0, p7_l);
+        tmp1 = __lsx_vadd_h(tmp1, tmp0);
+        p1_filter16 = __lsx_vsrari_h(tmp1, 4);
+        DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16,
+                  p0_filter16, p1_filter16);
+        DUP2_ARG3(__lsx_vbitsel_v, p6, p0_filter16, flat2, p5, p1_filter16,
+                  flat2, p0_filter16, p1_filter16);
+        __lsx_vstelm_d(p0_filter16, dst, 0, 0);
+        dst += stride;
+        __lsx_vstelm_d(p1_filter16, dst, 0, 0);
+        dst += stride;
+
+        /* calculation of p4 and p3 */
+        tmp0 = __lsx_vsub_h(p4_l, p5_l);
+        tmp0 = __lsx_vadd_h(tmp0, q2_l);
+        tmp0 = __lsx_vsub_h(tmp0, p7_l);
+        tmp2 = __lsx_vsub_h(p3_l, p4_l);
+        tmp2 = __lsx_vadd_h(tmp2, q3_l);
+        tmp2 = __lsx_vsub_h(tmp2, p7_l);
+        tmp1 = __lsx_vadd_h(tmp1, tmp0);
+        p0_filter16 = __lsx_vsrari_h(tmp1, 4);
+        tmp1 = __lsx_vadd_h(tmp1, tmp2);
+        p1_filter16 = __lsx_vsrari_h(tmp1, 4);
+        DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16,
+                  p0_filter16, p1_filter16);
+        DUP2_ARG3(__lsx_vbitsel_v, p4, p0_filter16, flat2, p3, p1_filter16,
+                  flat2, p0_filter16, p1_filter16);
+        __lsx_vstelm_d(p0_filter16, dst, 0, 0);
+        dst += stride;
+        __lsx_vstelm_d(p1_filter16, dst, 0, 0);
+        dst += stride;
+
+        /* calculation of p2 and p1 */
+        tmp0 = __lsx_vsub_h(p2_l, p3_l);
+        tmp0 = __lsx_vadd_h(tmp0, q4_l);
+        tmp0 = __lsx_vsub_h(tmp0, p7_l);
+        tmp2 = __lsx_vsub_h(p1_l, p2_l);
+        tmp2 = __lsx_vadd_h(tmp2, q5_l);
+        tmp2 = __lsx_vsub_h(tmp2, p7_l);
+        tmp1 = __lsx_vadd_h(tmp1, tmp0);
+        p0_filter16 = __lsx_vsrari_h(tmp1, 4);
+        tmp1 = __lsx_vadd_h(tmp1, tmp2);
+        p1_filter16 = __lsx_vsrari_h(tmp1, 4);
+        DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16,
+                  p0_filter16, p1_filter16);
+        DUP2_ARG3(__lsx_vbitsel_v, p2_out, p0_filter16, flat2, p1_out,
+                  p1_filter16, flat2, p0_filter16, p1_filter16);
+        __lsx_vstelm_d(p0_filter16, dst, 0, 0);
+        dst += stride;
+        __lsx_vstelm_d(p1_filter16, dst, 0, 0);
+        dst += stride;
+
+        /* calculation of p0 and q0 */
+        tmp0 = __lsx_vsub_h(p0_l, p1_l);
+        tmp0 = __lsx_vadd_h(tmp0, q6_l);
+        tmp0 = __lsx_vsub_h(tmp0, p7_l);
+        tmp2 = __lsx_vsub_h(q7_l, p0_l);
+        tmp2 = __lsx_vadd_h(tmp2, q0_l);
+        tmp2 = __lsx_vsub_h(tmp2, p7_l);
+        tmp1 = __lsx_vadd_h(tmp1, tmp0);
+        p0_filter16 = __lsx_vsrari_h(tmp1, 4);
+        tmp1 = __lsx_vadd_h(tmp1, tmp2);
+        p1_filter16 = __lsx_vsrari_h(tmp1, 4);
+        DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16,
+                  p0_filter16, p1_filter16);
+        DUP2_ARG3(__lsx_vbitsel_v, p0_out, p0_filter16, flat2, q0_out,
+                  p1_filter16, flat2, p0_filter16, p1_filter16);
+        __lsx_vstelm_d(p0_filter16, dst, 0, 0);
+        dst += stride;
+        __lsx_vstelm_d(p1_filter16, dst, 0, 0);
+        dst += stride;
+
+        /* calculation of q1 and q2 */
+        tmp0 = __lsx_vsub_h(q7_l, q0_l);
+        tmp0 = __lsx_vadd_h(tmp0, q1_l);
+        tmp0 = __lsx_vsub_h(tmp0, p6_l);
+        tmp2 = __lsx_vsub_h(q7_l, q1_l);
+        tmp2 = __lsx_vadd_h(tmp2, q2_l);
+        tmp2 = __lsx_vsub_h(tmp2, p5_l);
+        tmp1 = __lsx_vadd_h(tmp1, tmp0);
+        p0_filter16 = __lsx_vsrari_h(tmp1, 4);
+        tmp1 = __lsx_vadd_h(tmp1, tmp2);
+        p1_filter16 = __lsx_vsrari_h(tmp1, 4);
+        DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16,
+                  p0_filter16, p1_filter16);
+        DUP2_ARG3(__lsx_vbitsel_v, q1_out, p0_filter16, flat2, q2_out,
+                  p1_filter16, flat2, p0_filter16, p1_filter16);
+        __lsx_vstelm_d(p0_filter16, dst, 0, 0);
+        dst += stride;
+        __lsx_vstelm_d(p1_filter16, dst, 0, 0);
+        dst += stride;
+
+        /* calculation of q3 and q4 */
+        tmp0 = __lsx_vsub_h(q7_l, q2_l);
+        tmp0 = __lsx_vadd_h(tmp0, q3_l);
+        tmp0 = __lsx_vsub_h(tmp0, p4_l);
+        tmp2 = __lsx_vsub_h(q7_l, q3_l);
+        tmp2 = __lsx_vadd_h(tmp2, q4_l);
+        tmp2 = __lsx_vsub_h(tmp2, p3_l);
+        tmp1 = __lsx_vadd_h(tmp1, tmp0);
+        p0_filter16 = __lsx_vsrari_h(tmp1, 4);
+        tmp1 = __lsx_vadd_h(tmp1, tmp2);
+        p1_filter16 = __lsx_vsrari_h(tmp1, 4);
+        DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16,
+                  p0_filter16, p1_filter16);
+        DUP2_ARG3(__lsx_vbitsel_v, q3, p0_filter16, flat2, q4, p1_filter16,
+                  flat2, p0_filter16, p1_filter16);
+        __lsx_vstelm_d(p0_filter16, dst, 0, 0);
+        dst += stride;
+        __lsx_vstelm_d(p1_filter16, dst, 0, 0);
+        dst += stride;
+
+        /* calculation of q5 and q6 */
+        tmp0 = __lsx_vsub_h(q7_l, q4_l);
+        tmp0 = __lsx_vadd_h(tmp0, q5_l);
+        tmp0 = __lsx_vsub_h(tmp0, p2_l);
+        tmp2 = __lsx_vsub_h(q7_l, q5_l);
+        tmp2 = __lsx_vadd_h(tmp2, q6_l);
+        tmp2 = __lsx_vsub_h(tmp2, p1_l);
+        tmp1 = __lsx_vadd_h(tmp1, tmp0);
+        p0_filter16 = __lsx_vsrari_h(tmp1, 4);
+        tmp1 = __lsx_vadd_h(tmp1, tmp2);
+        p1_filter16 = __lsx_vsrari_h(tmp1, 4);
+        DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16,
+                  p0_filter16, p1_filter16);
+        DUP2_ARG3(__lsx_vbitsel_v, q5, p0_filter16, flat2, q6, p1_filter16,
+                  flat2, p0_filter16, p1_filter16);
+        __lsx_vstelm_d(p0_filter16, dst, 0, 0);
+        dst += stride;
+        __lsx_vstelm_d(p1_filter16, dst, 0, 0);
+      }
+    }
+  } else {
+    mb_lpf_horizontal_edge_dual(dst, stride, b_limit_ptr, limit_ptr,
+                                thresh_ptr);
+  }
+}
+
+/* Public entry point for the 16-tap horizontal-edge filter, dual variant.
+ * All arguments are handed to mb_lpf_horizontal_edge() together with a count
+ * of 2, which makes that helper take its mb_lpf_horizontal_edge_dual() branch.
+ */
+void vpx_lpf_horizontal_16_dual_lsx(uint8_t *dst, int32_t stride,
+                                    const uint8_t *b_limit_ptr,
+                                    const uint8_t *limit_ptr,
+                                    const uint8_t *thresh_ptr) {
+  const int32_t dual_count = 2;
+
+  mb_lpf_horizontal_edge(dst, stride, b_limit_ptr, limit_ptr, thresh_ptr,
+                         dual_count);
+}
+/* Transposes a 16x16 block of bytes: sixteen 16-byte rows are read from
+ * `input` (rows in_stride bytes apart) and sixteen result rows are written to
+ * `output` (rows out_stride bytes apart).
+ *
+ * NOTE(review): LSX_LD_8, LSX_ST_8 and LSX_TRANSPOSE16x8_B are macros defined
+ * outside this excerpt.  The explicit four-row pointer bump between each pair
+ * of load/store calls suggests every macro call also advances its pointer by
+ * four rows internally; confirm against the macro definitions.
+ */
+static void transpose_16x16(uint8_t *input, int32_t in_stride, uint8_t *output,
+ int32_t out_stride) {
+ __m128i row0, row1, row2, row3, row4, row5, row6, row7;
+ __m128i row8, row9, row10, row11, row12, row13, row14, row15;
+ __m128i tmp0, tmp1, tmp4, tmp5, tmp6, tmp7;
+ __m128i tmp2, tmp3;
+ __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+ int32_t in_stride2 = in_stride << 1;
+ int32_t in_stride3 = in_stride2 + in_stride;
+ int32_t in_stride4 = in_stride2 << 1;
+ int32_t out_stride2 = out_stride << 1;
+ int32_t out_stride3 = out_stride2 + out_stride;
+ int32_t out_stride4 = out_stride2 << 1;
+ /* Read the sixteen source rows, eight per macro call. */
+ LSX_LD_8(input, in_stride, in_stride2, in_stride3, in_stride4, row0, row1,
+ row2, row3, row4, row5, row6, row7);
+ input += in_stride4;
+ LSX_LD_8(input, in_stride, in_stride2, in_stride3, in_stride4, row8, row9,
+ row10, row11, row12, row13, row14, row15);
+ /* p7..p0 are produced here and stored below as the first eight output rows. */
+ LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8,
+ row9, row10, row11, row12, row13, row14, row15, p7, p6,
+ p5, p4, p3, p2, p1, p0);
+
+ /* transpose 16x8 matrix into 8x16 */
+ /* total 8 intermediate register and 32 instructions */
+ q7 = __lsx_vpackod_d(row8, row0); /* upper 64 bits of row0 and of row8 */
+ q6 = __lsx_vpackod_d(row9, row1);
+ q5 = __lsx_vpackod_d(row10, row2);
+ q4 = __lsx_vpackod_d(row11, row3);
+ q3 = __lsx_vpackod_d(row12, row4);
+ q2 = __lsx_vpackod_d(row13, row5);
+ q1 = __lsx_vpackod_d(row14, row6);
+ q0 = __lsx_vpackod_d(row15, row7);
+ /* Pack even/odd bytes, then halfwords, then words; q5 and q7 double as temporaries. */
+ DUP2_ARG2(__lsx_vpackev_b, q6, q7, q4, q5, tmp0, tmp1);
+ DUP2_ARG2(__lsx_vpackod_b, q6, q7, q4, q5, tmp4, tmp5);
+
+ DUP2_ARG2(__lsx_vpackev_b, q2, q3, q0, q1, q5, q7);
+ DUP2_ARG2(__lsx_vpackod_b, q2, q3, q0, q1, tmp6, tmp7);
+
+ DUP2_ARG2(__lsx_vpackev_h, tmp1, tmp0, q7, q5, tmp2, tmp3);
+ q0 = __lsx_vpackev_w(tmp3, tmp2);
+ q4 = __lsx_vpackod_w(tmp3, tmp2);
+
+ tmp2 = __lsx_vpackod_h(tmp1, tmp0);
+ tmp3 = __lsx_vpackod_h(q7, q5);
+ q2 = __lsx_vpackev_w(tmp3, tmp2);
+ q6 = __lsx_vpackod_w(tmp3, tmp2);
+
+ DUP2_ARG2(__lsx_vpackev_h, tmp5, tmp4, tmp7, tmp6, tmp2, tmp3);
+ q1 = __lsx_vpackev_w(tmp3, tmp2);
+ q5 = __lsx_vpackod_w(tmp3, tmp2);
+
+ tmp2 = __lsx_vpackod_h(tmp5, tmp4);
+ tmp3 = __lsx_vpackod_h(tmp7, tmp6);
+ q3 = __lsx_vpackev_w(tmp3, tmp2);
+ q7 = __lsx_vpackod_w(tmp3, tmp2);
+ /* Write p7..p0 first, then q0..q7. */
+ LSX_ST_8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_stride, out_stride2,
+ out_stride3, out_stride4);
+ output += out_stride4;
+ LSX_ST_8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_stride, out_stride2,
+ out_stride3, out_stride4);
+}
+/* First stage (filter4 and filter8) of the 16-row vertical-edge filter,
+ * operating on a transposed copy of the pixels.
+ *
+ * dst        - q0 row inside a buffer whose rows are 16 bytes apart: p3..p0
+ *              are read from dst - 64 .. dst - 16 and q0..q3 from
+ *              dst .. dst + 48.
+ * filter48   - output scratch; when 0 is returned it holds p2, p1, p0, q0,
+ *              q1, q2 at byte offsets 0, 16, 32, 48, 64, 80 and the flat mask
+ *              at offset 96.
+ * dst_org    - pointer into the untransposed picture; written only on the
+ *              early-exit path, starting two bytes to its left.
+ * stride     - row pitch used with dst_org.
+ * b_limit_ptr, limit_ptr, thresh_ptr
+ *            - each supplies one byte that is replicated into every lane.
+ *
+ * Returns 1 when `flat` is zero in every lane (the filter4 result has then
+ * been written straight to dst_org); otherwise returns 0 with filter48 filled.
+ */
+static int32_t vt_lpf_t4_and_t8_16w(uint8_t *dst, uint8_t *filter48,
+ uint8_t *dst_org, int32_t stride,
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr) {
+ int32_t stride2 = stride << 1;
+ int32_t stride3 = stride2 + stride;
+ int32_t stride4 = stride2 << 1;
+ __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+ __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+ __m128i flat, mask, hev, thresh, b_limit, limit;
+ __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+ __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h;
+ __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
+ __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
+ __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h;
+ __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5;
+
+ /* load vector elements */
+ DUP4_ARG2(__lsx_vld, dst, -64, dst, -48, dst, -32, dst, -16, p3, p2, p1, p0);
+ DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, q0, q1, q2, q3);
+ /* Replicate the three one-byte thresholds across all lanes. */
+ thresh = __lsx_vldrepl_b(thresh_ptr, 0);
+ b_limit = __lsx_vldrepl_b(b_limit_ptr, 0);
+ limit = __lsx_vldrepl_b(limit_ptr, 0);
+
+ /* mask and hev */
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ /* flat4 */
+ VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+ /* filter4 */
+ VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+ /* if flat is zero for all pixels, then no need to calculate other filter */
+ if (__lsx_bz_v(flat)) {
+ DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+ vec2 = __lsx_vilvl_h(vec1, vec0);
+ vec3 = __lsx_vilvh_h(vec1, vec0);
+ DUP2_ARG2(__lsx_vilvh_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+ vec4 = __lsx_vilvl_h(vec1, vec0);
+ vec5 = __lsx_vilvh_h(vec1, vec0);
+ /* Each 32-bit element is p1 p0 q0 q1 for one picture row; write 16 rows. */
+ dst_org -= 2;
+ __lsx_vstelm_w(vec2, dst_org, 0, 0);
+ __lsx_vstelm_w(vec2, dst_org + stride, 0, 1);
+ __lsx_vstelm_w(vec2, dst_org + stride2, 0, 2);
+ __lsx_vstelm_w(vec2, dst_org + stride3, 0, 3);
+ dst_org += stride4;
+ __lsx_vstelm_w(vec3, dst_org, 0, 0);
+ __lsx_vstelm_w(vec3, dst_org + stride, 0, 1);
+ __lsx_vstelm_w(vec3, dst_org + stride2, 0, 2);
+ __lsx_vstelm_w(vec3, dst_org + stride3, 0, 3);
+ dst_org += stride4;
+ __lsx_vstelm_w(vec4, dst_org, 0, 0);
+ __lsx_vstelm_w(vec4, dst_org + stride, 0, 1);
+ __lsx_vstelm_w(vec4, dst_org + stride2, 0, 2);
+ __lsx_vstelm_w(vec4, dst_org + stride3, 0, 3);
+ dst_org += stride4;
+ __lsx_vstelm_w(vec5, dst_org, 0, 0);
+ __lsx_vstelm_w(vec5, dst_org + stride, 0, 1);
+ __lsx_vstelm_w(vec5, dst_org + stride2, 0, 2);
+ __lsx_vstelm_w(vec5, dst_org + stride3, 0, 3);
+
+ return 1;
+ }
+ /* Widen the low and the high eight lanes to 16 bits and run filter8 on each half. */
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, p3, 0, p2, 0, p1, 0, p0, 0, p3_l, p2_l, p1_l,
+ p0_l);
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, q0, 0, q1, 0, q2, 0, q3, 0, q0_l, q1_l, q2_l,
+ q3_l);
+ VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+ p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+ DUP4_ARG1(__lsx_vexth_hu_bu, p3, p2, p1, p0, p3_h, p2_h, p1_h, p0_h);
+ DUP4_ARG1(__lsx_vexth_hu_bu, q0, q1, q2, q3, q0_h, q1_h, q2_h, q3_h);
+ VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h,
+ p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h);
+
+ /* convert 16 bit output data into 8 bit */
+ DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h, p1_filt8_l,
+ p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l, p2_filt8_l,
+ p1_filt8_l, p0_filt8_l, q0_filt8_l);
+ DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h, q2_filt8_l,
+ q1_filt8_l, q2_filt8_l);
+
+ /* per lane: filter8 result where flat is set, otherwise the filter4 result
+ * (or the unfiltered p2 / q2) */
+ p2_out = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
+ p1_out = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
+ p0_out = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
+ q0_out = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
+ q1_out = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
+ q2_out = __lsx_vbitsel_v(q2, q2_filt8_l, flat);
+ /* Save the six rows and the flat mask; vt_lpf_t16_16w() reads them back. */
+ __lsx_vst(p2_out, filter48, 0);
+ __lsx_vst(p1_out, filter48, 16);
+ __lsx_vst(p0_out, filter48, 32);
+ __lsx_vst(q0_out, filter48, 48);
+ __lsx_vst(q1_out, filter48, 64);
+ __lsx_vst(q2_out, filter48, 80);
+ __lsx_vst(flat, filter48, 96);
+
+ return 0;
+}
+/* Second (wide, filter16) stage of the 16-row vertical-edge filter, operating
+ * on a transposed copy of the pixels.
+ *
+ * dst      - q0 row inside a buffer whose rows are 16 bytes apart: p7..p0 are
+ *            read from dst - 128 .. dst - 16 and q0..q7 from dst .. dst + 112.
+ * dst_org  - pointer into the untransposed picture; written only on the
+ *            early-exit path, starting three bytes to its left.
+ * stride   - row pitch used with dst_org.
+ * filter48 - p2, p1, p0, q0, q1, q2 at byte offsets 0..80 and the flat mask
+ *            at offset 96, as stored by vt_lpf_t4_and_t8_16w().
+ *
+ * Returns 1 when flat2 is zero in every lane: the six filter48 rows are then
+ * interleaved and written directly to dst_org.  Otherwise returns 0 after
+ * overwriting rows p6..q6 of the transposed buffer;
+ * vpx_lpf_vertical_16_dual_lsx() transposes that buffer back into the picture.
+ */
+static int32_t vt_lpf_t16_16w(uint8_t *dst, uint8_t *dst_org, int32_t stride,
+ uint8_t *filter48) {
+ __m128i flat, flat2, filter8;
+ __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+ __m128i out_l, out_h;
+ v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in;
+ v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in;
+ v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in;
+ v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in;
+ v8u16 p7_h_in, p6_h_in, p5_h_in, p4_h_in;
+ v8u16 p3_h_in, p2_h_in, p1_h_in, p0_h_in;
+ v8u16 q7_h_in, q6_h_in, q5_h_in, q4_h_in;
+ v8u16 q3_h_in, q2_h_in, q1_h_in, q0_h_in;
+ v8u16 tmp0_l, tmp1_l, tmp0_h, tmp1_h;
+ uint8_t *dst_tmp = dst - 128;
+
+ flat = __lsx_vld(filter48, 96);
+ /* p7..p0 occupy the eight rows before dst, q0..q7 the eight rows from dst. */
+ DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48, p7,
+ p6, p5, p4);
+ DUP4_ARG2(__lsx_vld, dst_tmp, 64, dst_tmp, 80, dst_tmp, 96, dst_tmp, 112, p3,
+ p2, p1, p0);
+ DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, q0, q1, q2, q3);
+ DUP4_ARG2(__lsx_vld, dst, 64, dst, 80, dst, 96, dst, 112, q4, q5, q6, q7);
+
+ VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
+ /* if flat2 is zero for all pixels, then no need to calculate other filter */
+ if (__lsx_bz_v(flat2)) {
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+
+ DUP4_ARG2(__lsx_vld, filter48, 0, filter48, 16, filter48, 32, filter48, 48,
+ p2, p1, p0, q0);
+ DUP2_ARG2(__lsx_vld, filter48, 64, filter48, 80, q1, q2);
+ /* Build one 4-byte group (p2 p1 p0 q0) and one 2-byte group (q1 q2) per picture row. */
+ DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, vec0, vec1);
+ vec3 = __lsx_vilvl_h(vec1, vec0);
+ vec4 = __lsx_vilvh_h(vec1, vec0);
+ DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, vec0, vec1);
+ vec6 = __lsx_vilvl_h(vec1, vec0);
+ vec7 = __lsx_vilvh_h(vec1, vec0);
+ vec2 = __lsx_vilvl_b(q2, q1);
+ vec5 = __lsx_vilvh_b(q2, q1);
+ /* 16 rows: a 4-byte store at offset 0 plus a 2-byte store at offset 4. */
+ dst_org -= 3;
+ __lsx_vstelm_w(vec3, dst_org, 0, 0);
+ __lsx_vstelm_h(vec2, dst_org, 4, 0);
+ dst_org += stride;
+ __lsx_vstelm_w(vec3, dst_org, 0, 1);
+ __lsx_vstelm_h(vec2, dst_org, 4, 1);
+ dst_org += stride;
+ __lsx_vstelm_w(vec3, dst_org, 0, 2);
+ __lsx_vstelm_h(vec2, dst_org, 4, 2);
+ dst_org += stride;
+ __lsx_vstelm_w(vec3, dst_org, 0, 3);
+ __lsx_vstelm_h(vec2, dst_org, 4, 3);
+ dst_org += stride;
+ __lsx_vstelm_w(vec4, dst_org, 0, 0);
+ __lsx_vstelm_h(vec2, dst_org, 4, 4);
+ dst_org += stride;
+ __lsx_vstelm_w(vec4, dst_org, 0, 1);
+ __lsx_vstelm_h(vec2, dst_org, 4, 5);
+ dst_org += stride;
+ __lsx_vstelm_w(vec4, dst_org, 0, 2);
+ __lsx_vstelm_h(vec2, dst_org, 4, 6);
+ dst_org += stride;
+ __lsx_vstelm_w(vec4, dst_org, 0, 3);
+ __lsx_vstelm_h(vec2, dst_org, 4, 7);
+ dst_org += stride;
+ __lsx_vstelm_w(vec6, dst_org, 0, 0);
+ __lsx_vstelm_h(vec5, dst_org, 4, 0);
+ dst_org += stride;
+ __lsx_vstelm_w(vec6, dst_org, 0, 1);
+ __lsx_vstelm_h(vec5, dst_org, 4, 1);
+ dst_org += stride;
+ __lsx_vstelm_w(vec6, dst_org, 0, 2);
+ __lsx_vstelm_h(vec5, dst_org, 4, 2);
+ dst_org += stride;
+ __lsx_vstelm_w(vec6, dst_org, 0, 3);
+ __lsx_vstelm_h(vec5, dst_org, 4, 3);
+ dst_org += stride;
+ __lsx_vstelm_w(vec7, dst_org, 0, 0);
+ __lsx_vstelm_h(vec5, dst_org, 4, 4);
+ dst_org += stride;
+ __lsx_vstelm_w(vec7, dst_org, 0, 1);
+ __lsx_vstelm_h(vec5, dst_org, 4, 5);
+ dst_org += stride;
+ __lsx_vstelm_w(vec7, dst_org, 0, 2);
+ __lsx_vstelm_h(vec5, dst_org, 4, 6);
+ dst_org += stride;
+ __lsx_vstelm_w(vec7, dst_org, 0, 3);
+ __lsx_vstelm_h(vec5, dst_org, 4, 7);
+
+ return 1;
+ }
+
+ dst -= 7 * 16; /* dst now addresses the p6 row of the transposed buffer */
+
+ p7_l_in = (v8u16)__lsx_vsllwil_hu_bu(p7, 0);
+ p6_l_in = (v8u16)__lsx_vsllwil_hu_bu(p6, 0);
+ p5_l_in = (v8u16)__lsx_vsllwil_hu_bu(p5, 0);
+ p4_l_in = (v8u16)__lsx_vsllwil_hu_bu(p4, 0);
+ p3_l_in = (v8u16)__lsx_vsllwil_hu_bu(p3, 0);
+ p2_l_in = (v8u16)__lsx_vsllwil_hu_bu(p2, 0);
+ p1_l_in = (v8u16)__lsx_vsllwil_hu_bu(p1, 0);
+ p0_l_in = (v8u16)__lsx_vsllwil_hu_bu(p0, 0);
+ q0_l_in = (v8u16)__lsx_vsllwil_hu_bu(q0, 0);
+ /* p6, low eight lanes: 7*p7 + 2*p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0. */
+ tmp0_l = p7_l_in << 3;
+ tmp0_l -= p7_l_in;
+ tmp0_l += p6_l_in;
+ tmp0_l += q0_l_in;
+ tmp1_l = p6_l_in + p5_l_in;
+ tmp1_l += p4_l_in;
+ tmp1_l += p3_l_in;
+ tmp1_l += p2_l_in;
+ tmp1_l += p1_l_in;
+ tmp1_l += p0_l_in;
+ tmp1_l += tmp0_l;
+ out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); /* (sum + 8) >> 4 */
+ p7_h_in = (v8u16)__lsx_vexth_hu_bu(p7);
+ p6_h_in = (v8u16)__lsx_vexth_hu_bu(p6);
+ p5_h_in = (v8u16)__lsx_vexth_hu_bu(p5);
+ p4_h_in = (v8u16)__lsx_vexth_hu_bu(p4);
+ p3_h_in = (v8u16)__lsx_vexth_hu_bu(p3);
+ p2_h_in = (v8u16)__lsx_vexth_hu_bu(p2);
+ p1_h_in = (v8u16)__lsx_vexth_hu_bu(p1);
+ p0_h_in = (v8u16)__lsx_vexth_hu_bu(p0);
+ q0_h_in = (v8u16)__lsx_vexth_hu_bu(q0);
+ /* Same sum for the high eight lanes. */
+ tmp0_h = p7_h_in << 3;
+ tmp0_h -= p7_h_in;
+ tmp0_h += p6_h_in;
+ tmp0_h += q0_h_in;
+ tmp1_h = p6_h_in + p5_h_in;
+ tmp1_h += p4_h_in;
+ tmp1_h += p3_h_in;
+ tmp1_h += p2_h_in;
+ tmp1_h += p1_h_in;
+ tmp1_h += p0_h_in;
+ tmp1_h += tmp0_h;
+ out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+ out_l = __lsx_vpickev_b(out_h, out_l);
+ p6 = __lsx_vbitsel_v(p6, out_l, flat2);
+ __lsx_vst(p6, dst, 0);
+
+ /* p5: the running sums tmp1_l / tmp1_h are updated in place from here on;
+ * each step adds two taps and drops two (here +p5 +q1 -p6 -p7) */
+ q1_l_in = (v8u16)__lsx_vsllwil_hu_bu(q1, 0);
+ tmp0_l = p5_l_in - p6_l_in;
+ tmp0_l += q1_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+ q1_h_in = (v8u16)__lsx_vexth_hu_bu(q1);
+ tmp0_h = p5_h_in - p6_h_in;
+ tmp0_h += q1_h_in;
+ tmp0_h -= p7_h_in;
+ tmp1_h += tmp0_h;
+ out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+ out_l = __lsx_vpickev_b(out_h, out_l);
+ p5 = __lsx_vbitsel_v(p5, out_l, flat2);
+ __lsx_vst(p5, dst, 16);
+
+ /* p4 */
+ q2_l_in = (v8u16)__lsx_vsllwil_hu_bu(q2, 0);
+ tmp0_l = p4_l_in - p5_l_in;
+ tmp0_l += q2_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+ q2_h_in = (v8u16)__lsx_vexth_hu_bu(q2);
+ tmp0_h = p4_h_in - p5_h_in;
+ tmp0_h += q2_h_in;
+ tmp0_h -= p7_h_in;
+ tmp1_h += tmp0_h;
+ out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+ out_l = __lsx_vpickev_b(out_h, out_l);
+ p4 = __lsx_vbitsel_v(p4, out_l, flat2);
+ __lsx_vst(p4, dst, 16 * 2);
+
+ /* p3 */
+ q3_l_in = (v8u16)__lsx_vsllwil_hu_bu(q3, 0);
+ tmp0_l = p3_l_in - p4_l_in;
+ tmp0_l += q3_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+ q3_h_in = (v8u16)__lsx_vexth_hu_bu(q3);
+ tmp0_h = p3_h_in - p4_h_in;
+ tmp0_h += q3_h_in;
+ tmp0_h -= p7_h_in;
+ tmp1_h += tmp0_h;
+ out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+ out_l = __lsx_vpickev_b(out_h, out_l);
+ p3 = __lsx_vbitsel_v(p3, out_l, flat2);
+ __lsx_vst(p3, dst, 16 * 3);
+
+ /* p2: from here through q2 the value kept where flat2 is clear comes from
+ * filter48 rather than from the unfiltered row */
+ q4_l_in = (v8u16)__lsx_vsllwil_hu_bu(q4, 0);
+ filter8 = __lsx_vld(filter48, 0);
+ tmp0_l = p2_l_in - p3_l_in;
+ tmp0_l += q4_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+ q4_h_in = (v8u16)__lsx_vexth_hu_bu(q4);
+ tmp0_h = p2_h_in - p3_h_in;
+ tmp0_h += q4_h_in;
+ tmp0_h -= p7_h_in;
+ tmp1_h += tmp0_h;
+ out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+ out_l = __lsx_vpickev_b(out_h, out_l);
+ filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+ __lsx_vst(filter8, dst, 16 * 4);
+
+ /* p1 */
+ q5_l_in = (v8u16)__lsx_vsllwil_hu_bu(q5, 0);
+ filter8 = __lsx_vld(filter48, 16);
+ tmp0_l = p1_l_in - p2_l_in;
+ tmp0_l += q5_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+ q5_h_in = (v8u16)__lsx_vexth_hu_bu(q5);
+ tmp0_h = p1_h_in - p2_h_in;
+ tmp0_h += q5_h_in;
+ tmp0_h -= p7_h_in;
+ tmp1_h += tmp0_h;
+ out_h = __lsx_vsrari_h((__m128i)(tmp1_h), 4);
+ out_l = __lsx_vpickev_b(out_h, out_l);
+ filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+ __lsx_vst(filter8, dst, 16 * 5);
+
+ /* p0 */
+ q6_l_in = (v8u16)__lsx_vsllwil_hu_bu(q6, 0);
+ filter8 = __lsx_vld(filter48, 32);
+ tmp0_l = p0_l_in - p1_l_in;
+ tmp0_l += q6_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+ q6_h_in = (v8u16)__lsx_vexth_hu_bu(q6);
+ tmp0_h = p0_h_in - p1_h_in;
+ tmp0_h += q6_h_in;
+ tmp0_h -= p7_h_in;
+ tmp1_h += tmp0_h;
+ out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+ out_l = __lsx_vpickev_b(out_h, out_l);
+ filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+ __lsx_vst(filter8, dst, 16 * 6);
+
+ /* q0 */
+ q7_l_in = (v8u16)__lsx_vsllwil_hu_bu(q7, 0);
+ filter8 = __lsx_vld(filter48, 48);
+ tmp0_l = q7_l_in - p0_l_in;
+ tmp0_l += q0_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+ q7_h_in = (v8u16)__lsx_vexth_hu_bu(q7);
+ tmp0_h = q7_h_in - p0_h_in;
+ tmp0_h += q0_h_in;
+ tmp0_h -= p7_h_in;
+ tmp1_h += tmp0_h;
+ out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+ out_l = __lsx_vpickev_b(out_h, out_l);
+ filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+ __lsx_vst(filter8, dst, 16 * 7);
+
+ /* q1 */
+ filter8 = __lsx_vld(filter48, 64);
+ tmp0_l = q7_l_in - q0_l_in;
+ tmp0_l += q1_l_in;
+ tmp0_l -= p6_l_in;
+ tmp1_l += tmp0_l;
+ out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+ tmp0_h = q7_h_in - q0_h_in;
+ tmp0_h += q1_h_in;
+ tmp0_h -= p6_h_in;
+ tmp1_h += tmp0_h;
+ out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+ out_l = __lsx_vpickev_b(out_h, out_l);
+ filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+ __lsx_vst(filter8, dst, 16 * 8);
+
+ /* q2 */
+ filter8 = __lsx_vld(filter48, 80);
+ tmp0_l = q7_l_in - q1_l_in;
+ tmp0_l += q2_l_in;
+ tmp0_l -= p5_l_in;
+ tmp1_l += tmp0_l;
+ out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+ tmp0_h = q7_h_in - q1_h_in;
+ tmp0_h += q2_h_in;
+ tmp0_h -= p5_h_in;
+ tmp1_h += tmp0_h;
+ out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+ out_l = __lsx_vpickev_b(out_h, out_l);
+ filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+ __lsx_vst(filter8, dst, 16 * 9);
+
+ /* q3 */
+ tmp0_l = q7_l_in - q2_l_in;
+ tmp0_l += q3_l_in;
+ tmp0_l -= p4_l_in;
+ tmp1_l += tmp0_l;
+ out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+ tmp0_h = q7_h_in - q2_h_in;
+ tmp0_h += q3_h_in;
+ tmp0_h -= p4_h_in;
+ tmp1_h += tmp0_h;
+ out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+ out_l = __lsx_vpickev_b(out_h, out_l);
+ q3 = __lsx_vbitsel_v(q3, out_l, flat2);
+ __lsx_vst(q3, dst, 16 * 10);
+
+ /* q4 */
+ tmp0_l = q7_l_in - q3_l_in;
+ tmp0_l += q4_l_in;
+ tmp0_l -= p3_l_in;
+ tmp1_l += tmp0_l;
+ out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+ tmp0_h = q7_h_in - q3_h_in;
+ tmp0_h += q4_h_in;
+ tmp0_h -= p3_h_in;
+ tmp1_h += tmp0_h;
+ out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+ out_l = __lsx_vpickev_b(out_h, out_l);
+ q4 = __lsx_vbitsel_v(q4, out_l, flat2);
+ __lsx_vst(q4, dst, 16 * 11);
+
+ /* q5 */
+ tmp0_l = q7_l_in - q4_l_in;
+ tmp0_l += q5_l_in;
+ tmp0_l -= p2_l_in;
+ tmp1_l += tmp0_l;
+ out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+ tmp0_h = q7_h_in - q4_h_in;
+ tmp0_h += q5_h_in;
+ tmp0_h -= p2_h_in;
+ tmp1_h += tmp0_h;
+ out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+ out_l = __lsx_vpickev_b(out_h, out_l);
+ q5 = __lsx_vbitsel_v(q5, out_l, flat2);
+ __lsx_vst(q5, dst, 16 * 12);
+
+ /* q6 */
+ tmp0_l = q7_l_in - q5_l_in;
+ tmp0_l += q6_l_in;
+ tmp0_l -= p1_l_in;
+ tmp1_l += tmp0_l;
+ out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+ tmp0_h = q7_h_in - q5_h_in;
+ tmp0_h += q6_h_in;
+ tmp0_h -= p1_h_in;
+ tmp1_h += tmp0_h;
+ out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+ out_l = __lsx_vpickev_b(out_h, out_l);
+ q6 = __lsx_vbitsel_v(q6, out_l, flat2);
+ __lsx_vst(q6, dst, 16 * 13);
+
+ return 0;
+}
+
+/* 16-tap loop filter across a vertical edge, 16 rows tall.
+ *
+ * The 16x16 block starting eight bytes to the left of src is transposed into
+ * a local buffer with 16-byte rows, so the edge can be filtered with the same
+ * row-oriented vector code.  Both filter stages may finish early, in which
+ * case they have already written their result to src themselves; only when
+ * neither does is the buffer transposed back over the picture.
+ *
+ * src   - first pixel to the right of the edge in the top row.
+ * pitch - distance in bytes between picture rows.
+ * b_limit_ptr, limit_ptr, thresh_ptr - forwarded to the first filter stage.
+ */
+void vpx_lpf_vertical_16_dual_lsx(uint8_t *src, int32_t pitch,
+                                  const uint8_t *b_limit_ptr,
+                                  const uint8_t *limit_ptr,
+                                  const uint8_t *thresh_ptr) {
+  /* 16 transposed rows followed by 8 rows of inter-stage scratch. */
+  DECLARE_ALIGNED(16, uint8_t, transposed_input[16 * 24]);
+  uint8_t *const block_start = &transposed_input[0];
+  uint8_t *const edge_row = transposed_input + 16 * 8;
+  uint8_t *const filter48 = &transposed_input[16 * 16];
+  uint8_t *const picture_block = src - 8;
+  uint8_t stage_finished;
+
+  transpose_16x16(picture_block, pitch, block_start, 16);
+
+  stage_finished = vt_lpf_t4_and_t8_16w(edge_row, filter48, src, pitch,
+                                        b_limit_ptr, limit_ptr, thresh_ptr);
+  if (stage_finished != 0) {
+    return;
+  }
+
+  stage_finished = vt_lpf_t16_16w(edge_row, src, pitch, filter48);
+  if (stage_finished != 0) {
+    return;
+  }
+
+  transpose_16x16(block_start, 16, picture_block, pitch);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_4_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_4_lsx.c
new file mode 100644
index 0000000000..9300b5c5ae
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_4_lsx.c
@@ -0,0 +1,214 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/loopfilter_lsx.h"
+
+void vpx_lpf_horizontal_4_lsx(uint8_t *src, int32_t pitch, /* filter4 across a horizontal edge; src is the q0 row, the four rows above it are p0..p3 */
+ const uint8_t *b_limit_ptr, /* each of the three pointers supplies one byte that is replicated below */
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr) {
+ __m128i mask, hev, flat, thresh, b_limit, limit;
+ __m128i p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out;
+ int32_t pitch2 = pitch << 1; /* 2 * pitch */
+ int32_t pitch3 = pitch2 + pitch; /* 3 * pitch */
+ int32_t pitch4 = pitch2 << 1; /* 4 * pitch */
+
+ DUP4_ARG2(__lsx_vldx, src, -pitch4, src, -pitch3, src, -pitch2, src, -pitch, /* rows above the edge: p3 (farthest) .. p0 (nearest) */
+ p3, p2, p1, p0);
+ q0 = __lsx_vld(src, 0); /* rows at and below the edge: q0 .. q3 */
+ DUP2_ARG2(__lsx_vldx, src, pitch, src, pitch2, q1, q2);
+ q3 = __lsx_vldx(src, pitch3);
+
+ thresh = __lsx_vldrepl_b(thresh_ptr, 0); /* load-and-replicate of the byte at each pointer */
+ b_limit = __lsx_vldrepl_b(b_limit_ptr, 0);
+ limit = __lsx_vldrepl_b(limit_ptr, 0);
+
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, /* flat is written by the macro but not used afterwards in this function */
+ mask, flat);
+ VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+ __lsx_vstelm_d(p1_out, src - pitch2, 0, 0); /* only 64-bit element 0 of each result is stored, i.e. 8 pixels per row */
+ __lsx_vstelm_d(p0_out, src - pitch, 0, 0);
+ __lsx_vstelm_d(q0_out, src, 0, 0);
+ __lsx_vstelm_d(q1_out, src + pitch, 0, 0);
+}
+
+void vpx_lpf_horizontal_4_dual_lsx(uint8_t *src, int32_t pitch, /* filter4 across a horizontal edge, 16 pixels wide, with one parameter set per 8-pixel half */
+ const uint8_t *b_limit0_ptr, /* *0 parameters end up in the low 64 bits of each parameter vector */
+ const uint8_t *limit0_ptr,
+ const uint8_t *thresh0_ptr,
+ const uint8_t *b_limit1_ptr, /* *1 parameters end up in the high 64 bits */
+ const uint8_t *limit1_ptr,
+ const uint8_t *thresh1_ptr) {
+ __m128i mask, hev, flat, thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
+ __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+ int32_t pitch2 = pitch << 1; /* 2 * pitch */
+ int32_t pitch3 = pitch2 + pitch; /* 3 * pitch */
+ int32_t pitch4 = pitch2 << 1; /* 4 * pitch */
+
+ DUP4_ARG2(__lsx_vldx, src, -pitch4, src, -pitch3, src, -pitch2, src, -pitch, /* rows above the edge: p3 .. p0 */
+ p3, p2, p1, p0);
+ q0 = __lsx_vld(src, 0); /* rows at and below the edge: q0 .. q3 */
+ DUP2_ARG2(__lsx_vldx, src, pitch, src, pitch2, q1, q2);
+ q3 = __lsx_vldx(src, pitch3);
+
+ thresh0 = __lsx_vldrepl_b(thresh0_ptr, 0);
+ thresh1 = __lsx_vldrepl_b(thresh1_ptr, 0);
+ thresh0 = __lsx_vilvl_d(thresh1, thresh0); /* combined vector: low half from thresh0, high half from thresh1 */
+
+ b_limit0 = __lsx_vldrepl_b(b_limit0_ptr, 0);
+ b_limit1 = __lsx_vldrepl_b(b_limit1_ptr, 0);
+ b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0); /* same low/high combination for b_limit */
+
+ limit0 = __lsx_vldrepl_b(limit0_ptr, 0);
+ limit1 = __lsx_vldrepl_b(limit1_ptr, 0);
+ limit0 = __lsx_vilvl_d(limit1, limit0); /* same low/high combination for limit */
+
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev, /* flat is written by the macro but not used afterwards in this function */
+ mask, flat);
+ VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); /* outputs overwrite the inputs; the macro copies its inputs to temporaries first */
+
+ __lsx_vstx(p1, src, -pitch2); /* full 128-bit stores: all 16 pixels of each of the four filtered rows */
+ __lsx_vstx(p0, src, -pitch);
+ __lsx_vst(q0, src, 0);
+ __lsx_vstx(q1, src, pitch);
+}
+
+void vpx_lpf_vertical_4_lsx(uint8_t *src, int32_t pitch, /* filter4 across a vertical edge over 8 rows; src points at the first column right of the edge */
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr) {
+ __m128i mask, hev, flat, limit, thresh, b_limit;
+ __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+ __m128i vec0, vec1, vec2, vec3;
+ int32_t pitch2 = pitch << 1; /* 2 * pitch */
+ int32_t pitch3 = pitch2 + pitch; /* 3 * pitch */
+ int32_t pitch4 = pitch2 << 1; /* 4 * pitch */
+ uint8_t *src_tmp = src - 4; /* loads start 4 bytes left of the edge */
+
+ p3 = __lsx_vld(src_tmp, 0); /* until the transpose below, p3..q3 hold rows 0..7 rather than pixel columns */
+ DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, p2, p1);
+ p0 = __lsx_vldx(src_tmp, pitch3);
+ src_tmp += pitch4; /* advance to rows 4..7 */
+ q0 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, q1, q2);
+ q3 = __lsx_vldx(src_tmp, pitch3);
+
+ thresh = __lsx_vldrepl_b(thresh_ptr, 0); /* load-and-replicate of the byte at each pointer */
+ b_limit = __lsx_vldrepl_b(b_limit_ptr, 0);
+ limit = __lsx_vldrepl_b(limit_ptr, 0);
+
+ LSX_TRANSPOSE8x8_B(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2, /* in-place transpose: afterwards each variable is presumably one pixel column (p3 leftmost) - confirm against the macro */
+ q3);
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, /* flat is written by the macro but not used afterwards in this function */
+ mask, flat);
+ VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); /* outputs overwrite the inputs; the macro copies its inputs to temporaries first */
+ DUP2_ARG2(__lsx_vilvl_b, p0, p1, q1, q0, vec0, vec1); /* byte-interleave the low halves: vec0 pairs p1 with p0, vec1 pairs q0 with q1 */
+ vec2 = __lsx_vilvl_h(vec1, vec0); /* 32-bit groups p1 p0 q0 q1, used for the first four rows */
+ vec3 = __lsx_vilvh_h(vec1, vec0); /* the same grouping for the last four rows */
+
+ src -= 2; /* output starts two columns left of the edge (the p1 column) */
+ __lsx_vstelm_w(vec2, src, 0, 0); /* each store writes one 32-bit element, i.e. 4 pixels of one row */
+ src += pitch;
+ __lsx_vstelm_w(vec2, src, 0, 1);
+ src += pitch;
+ __lsx_vstelm_w(vec2, src, 0, 2);
+ src += pitch;
+ __lsx_vstelm_w(vec2, src, 0, 3);
+ src += pitch;
+
+ __lsx_vstelm_w(vec3, src, 0, 0); /* rows 4..7 */
+ __lsx_vstelm_w(vec3, src + pitch, 0, 1);
+ __lsx_vstelm_w(vec3, src + pitch2, 0, 2);
+ __lsx_vstelm_w(vec3, src + pitch3, 0, 3);
+}
+
+void vpx_lpf_vertical_4_dual_lsx(uint8_t *src, int32_t pitch, /* filter4 across a vertical edge over 16 rows, with one parameter set per group of 8 rows */
+ const uint8_t *b_limit0_ptr, /* *0 parameters end up in the low 64 bits of each parameter vector */
+ const uint8_t *limit0_ptr,
+ const uint8_t *thresh0_ptr,
+ const uint8_t *b_limit1_ptr, /* *1 parameters end up in the high 64 bits */
+ const uint8_t *limit1_ptr,
+ const uint8_t *thresh1_ptr) {
+ __m128i mask, hev, flat;
+ __m128i thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
+ __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+ __m128i row0, row1, row2, row3, row4, row5, row6, row7;
+ __m128i row8, row9, row10, row11, row12, row13, row14, row15;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+ int32_t pitch2 = pitch << 1; /* 2 * pitch */
+ int32_t pitch3 = pitch2 + pitch; /* 3 * pitch */
+ int32_t pitch4 = pitch2 << 1; /* 4 * pitch */
+ uint8_t *src_tmp = src - 4; /* loads start 4 bytes left of the edge */
+
+ row0 = __lsx_vld(src_tmp, 0); /* rows 0..3 */
+ DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, row1, row2);
+ row3 = __lsx_vldx(src_tmp, pitch3);
+ src_tmp += pitch4;
+ row4 = __lsx_vld(src_tmp, 0); /* rows 4..7 */
+ DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, row5, row6);
+ row7 = __lsx_vldx(src_tmp, pitch3);
+ src_tmp += pitch4;
+ row8 = __lsx_vld(src_tmp, 0); /* rows 8..11 */
+ DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, row9, row10);
+ row11 = __lsx_vldx(src_tmp, pitch3);
+ src_tmp += pitch4;
+ row12 = __lsx_vld(src_tmp, 0); /* rows 12..15 */
+ DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, row13, row14);
+ row15 = __lsx_vldx(src_tmp, pitch3);
+
+ LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8, /* 16 rows in, 8 vectors out: presumably one pixel column each (p3 leftmost) - confirm against the macro */
+ row9, row10, row11, row12, row13, row14, row15, p3, p2,
+ p1, p0, q0, q1, q2, q3);
+
+ thresh0 = __lsx_vldrepl_b(thresh0_ptr, 0);
+ thresh1 = __lsx_vldrepl_b(thresh1_ptr, 0);
+ thresh0 = __lsx_vilvl_d(thresh1, thresh0); /* combined vector: low half from thresh0, high half from thresh1 */
+
+ b_limit0 = __lsx_vldrepl_b(b_limit0_ptr, 0);
+ b_limit1 = __lsx_vldrepl_b(b_limit1_ptr, 0);
+ b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0); /* same low/high combination for b_limit */
+
+ limit0 = __lsx_vldrepl_b(limit0_ptr, 0);
+ limit1 = __lsx_vldrepl_b(limit1_ptr, 0);
+ limit0 = __lsx_vilvl_d(limit1, limit0); /* same low/high combination for limit */
+
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev, /* flat is written by the macro but not used afterwards in this function */
+ mask, flat);
+ VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); /* outputs overwrite the inputs; the macro copies its inputs to temporaries first */
+ DUP2_ARG2(__lsx_vilvl_b, p0, p1, q1, q0, tmp0, tmp1); /* low halves: tmp0 pairs p1 with p0, tmp1 pairs q0 with q1 */
+ tmp2 = __lsx_vilvl_h(tmp1, tmp0); /* 32-bit groups p1 p0 q0 q1 for rows 0..3 */
+ tmp3 = __lsx_vilvh_h(tmp1, tmp0); /* rows 4..7 */
+ DUP2_ARG2(__lsx_vilvh_b, p0, p1, q1, q0, tmp0, tmp1); /* same pairing for the high halves */
+ tmp4 = __lsx_vilvl_h(tmp1, tmp0); /* rows 8..11 */
+ tmp5 = __lsx_vilvh_h(tmp1, tmp0); /* rows 12..15 */
+
+ src -= 2; /* output starts two columns left of the edge (the p1 column) */
+ __lsx_vstelm_w(tmp2, src, 0, 0); /* each store writes one 32-bit element, i.e. 4 pixels of one row */
+ __lsx_vstelm_w(tmp2, src + pitch, 0, 1);
+ __lsx_vstelm_w(tmp2, src + pitch2, 0, 2);
+ __lsx_vstelm_w(tmp2, src + pitch3, 0, 3);
+ src += pitch4;
+ __lsx_vstelm_w(tmp3, src, 0, 0);
+ __lsx_vstelm_w(tmp3, src + pitch, 0, 1);
+ __lsx_vstelm_w(tmp3, src + pitch2, 0, 2);
+ __lsx_vstelm_w(tmp3, src + pitch3, 0, 3);
+ src += pitch4;
+ __lsx_vstelm_w(tmp4, src, 0, 0);
+ __lsx_vstelm_w(tmp4, src + pitch, 0, 1);
+ __lsx_vstelm_w(tmp4, src + pitch2, 0, 2);
+ __lsx_vstelm_w(tmp4, src + pitch3, 0, 3);
+ src += pitch4;
+ __lsx_vstelm_w(tmp5, src, 0, 0);
+ __lsx_vstelm_w(tmp5, src + pitch, 0, 1);
+ __lsx_vstelm_w(tmp5, src + pitch2, 0, 2);
+ __lsx_vstelm_w(tmp5, src + pitch3, 0, 3);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_8_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_8_lsx.c
new file mode 100644
index 0000000000..00219ba71d
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_8_lsx.c
@@ -0,0 +1,458 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/loopfilter_lsx.h"
+
+void vpx_lpf_horizontal_8_lsx(uint8_t *dst, int32_t stride, /* filter4/filter8 across a horizontal edge, 8 pixels wide; dst is the q0 row */
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr) {
+ __m128i mask, hev, flat, thresh, b_limit, limit;
+ __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+ __m128i p2_out, p1_out, p0_out, q0_out, q1_out;
+ __m128i p2_filter8, p1_filter8, p0_filter8;
+ __m128i q0_filter8, q1_filter8, q2_filter8;
+ __m128i p3_l, p2_l, p1_l, p0_l, q3_l, q2_l, q1_l, q0_l;
+
+ int32_t stride2 = stride << 1; /* 2 * stride */
+ int32_t stride3 = stride2 + stride; /* 3 * stride */
+ int32_t stride4 = stride2 << 1; /* 4 * stride */
+
+ /* load vector elements */
+ DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2, dst, /* rows above the edge: p3 .. p0 */
+ -stride, p3, p2, p1, p0);
+ q0 = __lsx_vld(dst, 0); /* rows at and below the edge: q0 .. q3 */
+ DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
+ q3 = __lsx_vldx(dst, stride3);
+
+ thresh = __lsx_vldrepl_b(thresh_ptr, 0); /* load-and-replicate of the byte at each pointer */
+ b_limit = __lsx_vldrepl_b(b_limit_ptr, 0);
+ limit = __lsx_vldrepl_b(limit_ptr, 0);
+
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); /* refines flat in place; the macro also reads the local variable mask */
+ VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+ flat = __lsx_vilvl_d(flat, flat); /* copy the low 64 bits into both halves so one mask covers the two-rows-per-vector packing below */
+
+ if (__lsx_bz_v(flat)) { /* flat is zero everywhere: only the filter4 results are needed */
+ __lsx_vstelm_d(p1_out, dst - stride2, 0, 0);
+ __lsx_vstelm_d(p0_out, dst - stride, 0, 0);
+ __lsx_vstelm_d(q0_out, dst, 0, 0);
+ __lsx_vstelm_d(q1_out, dst + stride, 0, 0);
+ } else {
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, p3, 0, p2, 0, p1, 0, p0, 0, p3_l, p2_l, p1_l, /* widen the low 8 bytes of each row to 16-bit lanes */
+ p0_l);
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, q0, 0, q1, 0, q2, 0, q3, 0, q0_l, q1_l, q2_l,
+ q3_l);
+ VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filter8,
+ p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);
+
+ DUP2_ARG2(__lsx_vpickev_b, p1_filter8, p2_filter8, q0_filter8, p0_filter8, /* narrow to bytes and pack two rows per vector: {p2, p1} and {p0, q0} */
+ p1_filter8, q0_filter8);
+ q2_filter8 = __lsx_vpickev_b(q2_filter8, q1_filter8); /* {q1, q2} */
+
+ p2 = __lsx_vilvl_d(p1_out, p2); /* fallback values in the same layout: {p2 unfiltered, p1 filter4} */
+ p0_out = __lsx_vilvl_d(q0_out, p0_out); /* {p0 filter4, q0 filter4} */
+ q1_out = __lsx_vilvl_d(q2, q1_out); /* {q1 filter4, q2 unfiltered} */
+
+ DUP2_ARG3(__lsx_vbitsel_v, p2, p1_filter8, flat, p0_out, q0_filter8, flat, /* filter8 value where flat is set, fallback elsewhere; p2_out = {p2, p1}, p1_out = {p0, q0} */
+ p2_out, p1_out);
+ p0_out = __lsx_vbitsel_v(q1_out, q2_filter8, flat); /* despite its name, p0_out now holds {q1, q2} */
+ dst -= stride3; /* first output row is p2 */
+
+ __lsx_vstelm_d(p2_out, dst, 0, 0); /* p2 */
+ __lsx_vstelm_d(p2_out, dst + stride, 0, 1); /* p1 */
+ __lsx_vstelm_d(p1_out, dst + stride2, 0, 0); /* p0 */
+ __lsx_vstelm_d(p1_out, dst + stride3, 0, 1); /* q0 */
+
+ dst += stride4;
+ __lsx_vstelm_d(p0_out, dst, 0, 0); /* q1 */
+ dst += stride;
+ __lsx_vstelm_d(p0_out, dst, 0, 1); /* q2 */
+ }
+}
+
+void vpx_lpf_horizontal_8_dual_lsx( /* filter4/filter8 across a horizontal edge, 16 pixels wide, with one parameter set per 8-pixel half */
+ uint8_t *dst, int32_t stride, const uint8_t *b_limit0,
+ const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *b_limit1,
+ const uint8_t *limit1, const uint8_t *thresh1) {
+ __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+ __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+ __m128i flat, mask, hev, thresh, b_limit, limit;
+ __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+ __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h;
+ __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
+ __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
+ __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h;
+ __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h;
+
+ int32_t stride2 = stride << 1; /* 2 * stride */
+ int32_t stride3 = stride2 + stride; /* 3 * stride */
+ int32_t stride4 = stride2 << 1; /* 4 * stride */
+
+ DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2, dst, /* rows above the edge: p3 .. p0 */
+ -stride, p3, p2, p1, p0);
+ q0 = __lsx_vld(dst, 0); /* rows at and below the edge: q0 .. q3 */
+ DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
+ q3 = __lsx_vldx(dst, stride3);
+
+ thresh = __lsx_vldrepl_b(thresh0, 0);
+ p2_out = __lsx_vldrepl_b(thresh1, 0); /* p2_out is only a temporary here for the second parameter set */
+ thresh = __lsx_vilvl_d(p2_out, thresh); /* low half from thresh0, high half from thresh1 */
+
+ b_limit = __lsx_vldrepl_b(b_limit0, 0);
+ p2_out = __lsx_vldrepl_b(b_limit1, 0);
+ b_limit = __lsx_vilvl_d(p2_out, b_limit); /* same low/high combination for b_limit */
+
+ limit = __lsx_vldrepl_b(limit0, 0);
+ p2_out = __lsx_vldrepl_b(limit1, 0);
+ limit = __lsx_vilvl_d(p2_out, limit); /* same low/high combination for limit */
+
+ /* mask and hev */
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); /* refines flat in place; the macro also reads the local variable mask */
+ VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+ if (__lsx_bz_v(flat)) { /* flat is zero everywhere: only the filter4 results are needed */
+ __lsx_vst(p1_out, dst - stride2, 0);
+ __lsx_vst(p0_out, dst - stride, 0);
+ __lsx_vst(q0_out, dst, 0);
+ __lsx_vst(q1_out, dst + stride, 0);
+ } else {
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, p3, 0, p2, 0, p1, 0, p0, 0, p3_l, p2_l, p1_l, /* widen the low 8 bytes of each row to 16-bit lanes */
+ p0_l);
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, q0, 0, q1, 0, q2, 0, q3, 0, q0_l, q1_l, q2_l,
+ q3_l);
+ VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+ p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+
+ DUP4_ARG1(__lsx_vexth_hu_bu, p3, p2, p1, p0, p3_h, p2_h, p1_h, p0_h); /* widen the high 8 bytes of each row to 16-bit lanes */
+ DUP4_ARG1(__lsx_vexth_hu_bu, q0, q1, q2, q3, q0_h, q1_h, q2_h, q3_h);
+ VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h,
+ p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h);
+
+ /* convert 16 bit output data into 8 bit */
+ DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h, p1_filt8_l, /* the *_filt8_l variables are reused to hold the packed 16-byte rows */
+ p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l, p2_filt8_l,
+ p1_filt8_l, p0_filt8_l, q0_filt8_l);
+ DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h, q2_filt8_l,
+ q1_filt8_l, q2_filt8_l);
+
+ /* select the filter8 result where flat is set; elsewhere keep the filter4 result (unfiltered input for p2 and q2) */
+ p2_out = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
+ p1_out = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
+ p0_out = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
+ q0_out = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
+ q1_out = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
+ q2_out = __lsx_vbitsel_v(q2, q2_filt8_l, flat);
+
+ __lsx_vst(p2_out, dst - stride3, 0); /* six full 16-pixel rows, p2 .. q2 */
+ __lsx_vst(p1_out, dst - stride2, 0);
+ __lsx_vst(p0_out, dst - stride, 0);
+ __lsx_vst(q0_out, dst, 0);
+ __lsx_vst(q1_out, dst + stride, 0);
+ __lsx_vst(q2_out, dst + stride2, 0);
+ }
+}
+
+void vpx_lpf_vertical_8_lsx(uint8_t *dst, int32_t stride, /* filter4/filter8 across a vertical edge over 8 rows; dst points at the first column right of the edge */
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr) {
+ __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+ __m128i p1_out, p0_out, q0_out, q1_out;
+ __m128i flat, mask, hev, thresh, b_limit, limit;
+ __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+ __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
+ __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
+ __m128i zero = __lsx_vldi(0);
+
+ int32_t stride2 = stride << 1; /* 2 * stride */
+ int32_t stride3 = stride2 + stride; /* 3 * stride */
+ int32_t stride4 = stride2 << 1; /* 4 * stride */
+ uint8_t *dst_tmp = dst - 4; /* loads start 4 bytes left of the edge */
+
+ /* load vector elements */
+ p3 = __lsx_vld(dst_tmp, 0); /* until the transpose below, p3..q3 hold rows 0..7 rather than pixel columns */
+ DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, p2, p1);
+ p0 = __lsx_vldx(dst_tmp, stride3);
+ dst_tmp += stride4; /* advance to rows 4..7 */
+ q0 = __lsx_vld(dst_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, q1, q2);
+ q3 = __lsx_vldx(dst_tmp, stride3);
+
+ LSX_TRANSPOSE8x8_B(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2, /* in-place transpose: afterwards each variable is presumably one pixel column (p3 leftmost) - confirm against the macro */
+ q3);
+
+ thresh = __lsx_vldrepl_b(thresh_ptr, 0); /* load-and-replicate of the byte at each pointer */
+ b_limit = __lsx_vldrepl_b(b_limit_ptr, 0);
+ limit = __lsx_vldrepl_b(limit_ptr, 0);
+
+ /* mask and hev */
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ /* flat4 */
+ VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); /* refines flat in place; the macro also reads the local variable mask */
+ /* filter4 */
+ VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+ flat = __lsx_vilvl_d(zero, flat); /* keep the low 64 bits of flat and clear the high 64 bits */
+
+ /* if flat is zero for all pixels, then no need to calculate other filter */
+ if (__lsx_bz_v(flat)) {
+ /* Store 4 pixels p1..q1 per row */
+ DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, p0, p1); /* p0..p3 are reused as scratch: p0 pairs p1 with p0, p1 pairs q0 with q1 */
+ p2 = __lsx_vilvl_h(p1, p0); /* 32-bit groups p1 p0 q0 q1 for rows 0..3 */
+ p3 = __lsx_vilvh_h(p1, p0); /* rows 4..7 */
+
+ dst -= 2; /* output starts two columns left of the edge (the p1 column) */
+ __lsx_vstelm_w(p2, dst, 0, 0);
+ __lsx_vstelm_w(p2, dst + stride, 0, 1);
+ __lsx_vstelm_w(p2, dst + stride2, 0, 2);
+ __lsx_vstelm_w(p2, dst + stride3, 0, 3);
+ dst += stride4;
+ __lsx_vstelm_w(p3, dst, 0, 0);
+ __lsx_vstelm_w(p3, dst + stride, 0, 1);
+ __lsx_vstelm_w(p3, dst + stride2, 0, 2);
+ __lsx_vstelm_w(p3, dst + stride3, 0, 3);
+ } else {
+ DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, /* interleave with zero: widens the low 8 bytes to 16-bit lanes */
+ p1_l, p0_l);
+ DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l,
+ q2_l, q3_l);
+ VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+ p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+ /* convert 16 bit output data into 8 bit */
+ DUP4_ARG2(__lsx_vpickev_b, p2_filt8_l, p2_filt8_l, p1_filt8_l, p1_filt8_l, /* both operands are the same vector, so the packed bytes appear in both halves */
+ p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l, p2_filt8_l,
+ p1_filt8_l, p0_filt8_l, q0_filt8_l);
+ DUP2_ARG2(__lsx_vpickev_b, q1_filt8_l, q1_filt8_l, q2_filt8_l, q2_filt8_l,
+ q1_filt8_l, q2_filt8_l);
+ /* select the filter8 result where flat is set; elsewhere keep the filter4 result (unfiltered input for p2 and q2) */
+ p2 = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
+ p1 = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
+ p0 = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
+ q0 = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
+ q1 = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
+ q2 = __lsx_vbitsel_v(q2, q2_filt8_l, flat);
+
+ /* Store 6 pixels p2..q2 per row */
+ DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, p3, q3); /* p3 pairs p2 with p1, q3 pairs p0 with q0 (both reused as scratch) */
+ p1 = __lsx_vilvl_h(q3, p3); /* 32-bit groups p2 p1 p0 q0 for rows 0..3 */
+ p2 = __lsx_vilvh_h(q3, p3); /* rows 4..7 */
+ p3 = __lsx_vilvl_b(q2, q1); /* 16-bit groups q1 q2, one per row */
+ dst -= 3; /* output starts three columns left of the edge (the p2 column) */
+ __lsx_vstelm_w(p1, dst, 0, 0); /* each row: 4 bytes at dst, then 2 bytes at dst + 4 */
+ __lsx_vstelm_h(p3, dst, 4, 0);
+ dst += stride;
+ __lsx_vstelm_w(p1, dst, 0, 1);
+ __lsx_vstelm_h(p3, dst, 4, 1);
+ dst += stride;
+ __lsx_vstelm_w(p1, dst, 0, 2);
+ __lsx_vstelm_h(p3, dst, 4, 2);
+ dst += stride;
+ __lsx_vstelm_w(p1, dst, 0, 3);
+ __lsx_vstelm_h(p3, dst, 4, 3);
+ dst += stride;
+ __lsx_vstelm_w(p2, dst, 0, 0);
+ __lsx_vstelm_h(p3, dst, 4, 4);
+ dst += stride;
+ __lsx_vstelm_w(p2, dst, 0, 1);
+ __lsx_vstelm_h(p3, dst, 4, 5);
+ dst += stride;
+ __lsx_vstelm_w(p2, dst, 0, 2);
+ __lsx_vstelm_h(p3, dst, 4, 6);
+ dst += stride;
+ __lsx_vstelm_w(p2, dst, 0, 3);
+ __lsx_vstelm_h(p3, dst, 4, 7);
+ }
+}
+
+void vpx_lpf_vertical_8_dual_lsx(uint8_t *dst, int32_t stride, /* filter4/filter8 across a vertical edge over 16 rows, with one parameter set per group of 8 rows */
+ const uint8_t *b_limit0, const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *b_limit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ uint8_t *dst_tmp = dst - 4; /* loads start 4 bytes left of the edge */
+ __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+ __m128i p1_out, p0_out, q0_out, q1_out;
+ __m128i flat, mask, hev, thresh, b_limit, limit;
+ __m128i row4, row5, row6, row7, row12, row13, row14, row15;
+ __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+ __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h;
+ __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
+ __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
+ __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h;
+ __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h;
+ int32_t stride2 = stride << 1; /* 2 * stride */
+ int32_t stride3 = stride2 + stride; /* 3 * stride */
+ int32_t stride4 = stride2 << 1; /* 4 * stride */
+
+ p0 = __lsx_vld(dst_tmp, 0); /* p0, p1, p2, p3 temporarily hold rows 0..3 (they are the first four transpose inputs below) */
+ DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, p1, p2);
+ p3 = __lsx_vldx(dst_tmp, stride3);
+ dst_tmp += stride4;
+ row4 = __lsx_vld(dst_tmp, 0); /* rows 4..7 */
+ DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row5, row6);
+ row7 = __lsx_vldx(dst_tmp, stride3);
+ dst_tmp += stride4;
+
+ q3 = __lsx_vld(dst_tmp, 0); /* q3, q2, q1, q0 temporarily hold rows 8..11 */
+ DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, q2, q1);
+ q0 = __lsx_vldx(dst_tmp, stride3);
+ dst_tmp += stride4;
+ row12 = __lsx_vld(dst_tmp, 0); /* rows 12..15 */
+ DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row13, row14);
+ row15 = __lsx_vldx(dst_tmp, stride3);
+
+ /* transpose 16x8 matrix into 8x16 */
+ LSX_TRANSPOSE16x8_B(p0, p1, p2, p3, row4, row5, row6, row7, q3, q2, q1, q0, /* inputs are listed in row order 0..15; the outputs overwrite p3..q3, presumably as pixel columns - confirm against the macro */
+ row12, row13, row14, row15, p3, p2, p1, p0, q0, q1, q2,
+ q3);
+
+ thresh = __lsx_vldrepl_b(thresh0, 0);
+ p1_out = __lsx_vldrepl_b(thresh1, 0); /* p1_out is only a temporary here for the second parameter set */
+ thresh = __lsx_vilvl_d(p1_out, thresh); /* low half from thresh0, high half from thresh1 */
+
+ b_limit = __lsx_vldrepl_b(b_limit0, 0);
+ p1_out = __lsx_vldrepl_b(b_limit1, 0);
+ b_limit = __lsx_vilvl_d(p1_out, b_limit); /* same low/high combination for b_limit */
+
+ limit = __lsx_vldrepl_b(limit0, 0);
+ p1_out = __lsx_vldrepl_b(limit1, 0);
+ limit = __lsx_vilvl_d(p1_out, limit); /* same low/high combination for limit */
+
+ /* mask and hev */
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ /* flat4 */
+ VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); /* refines flat in place; the macro also reads the local variable mask */
+ /* filter4 */
+ VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+ /* if flat is zero for all pixels, then no need to calculate other filter */
+ if (__lsx_bz_v(flat)) {
+ DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, p0, p1); /* p0..p3, q2, q3 are reused as scratch: pair p1 with p0 and q0 with q1 (low halves) */
+ p2 = __lsx_vilvl_h(p1, p0); /* 32-bit groups p1 p0 q0 q1 for rows 0..3 */
+ p3 = __lsx_vilvh_h(p1, p0); /* rows 4..7 */
+ DUP2_ARG2(__lsx_vilvh_b, p0_out, p1_out, q1_out, q0_out, p0, p1); /* same pairing for the high halves */
+ q2 = __lsx_vilvl_h(p1, p0); /* rows 8..11 */
+ q3 = __lsx_vilvh_h(p1, p0); /* rows 12..15 */
+ dst -= 2; /* output starts two columns left of the edge (the p1 column) */
+ __lsx_vstelm_w(p2, dst, 0, 0);
+ __lsx_vstelm_w(p2, dst + stride, 0, 1);
+ __lsx_vstelm_w(p2, dst + stride2, 0, 2);
+ __lsx_vstelm_w(p2, dst + stride3, 0, 3);
+ dst += stride4;
+ __lsx_vstelm_w(p3, dst, 0, 0);
+ __lsx_vstelm_w(p3, dst + stride, 0, 1);
+ __lsx_vstelm_w(p3, dst + stride2, 0, 2);
+ __lsx_vstelm_w(p3, dst + stride3, 0, 3);
+ dst += stride4;
+ __lsx_vstelm_w(q2, dst, 0, 0);
+ __lsx_vstelm_w(q2, dst + stride, 0, 1);
+ __lsx_vstelm_w(q2, dst + stride2, 0, 2);
+ __lsx_vstelm_w(q2, dst + stride3, 0, 3);
+ dst += stride4;
+ __lsx_vstelm_w(q3, dst, 0, 0);
+ __lsx_vstelm_w(q3, dst + stride, 0, 1);
+ __lsx_vstelm_w(q3, dst + stride2, 0, 2);
+ __lsx_vstelm_w(q3, dst + stride3, 0, 3);
+ } else {
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, p3, 0, p2, 0, p1, 0, p0, 0, p3_l, p2_l, p1_l, /* widen the low 8 bytes of each column to 16-bit lanes */
+ p0_l);
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, q0, 0, q1, 0, q2, 0, q3, 0, q0_l, q1_l, q2_l,
+ q3_l);
+ VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+ p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+ DUP4_ARG1(__lsx_vexth_hu_bu, p3, p2, p1, p0, p3_h, p2_h, p1_h, p0_h); /* widen the high 8 bytes of each column to 16-bit lanes */
+ DUP4_ARG1(__lsx_vexth_hu_bu, q0, q1, q2, q3, q0_h, q1_h, q2_h, q3_h);
+
+ /* filter8 */
+ VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h,
+ p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h);
+
+ /* convert 16 bit output data into 8 bit */
+ DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h, p1_filt8_l, /* the *_filt8_l variables are reused to hold the packed 16-byte columns */
+ p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l, p2_filt8_l,
+ p1_filt8_l, p0_filt8_l, q0_filt8_l);
+ DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h, q2_filt8_l,
+ q1_filt8_l, q2_filt8_l);
+
+ /* select the filter8 result where flat is set; elsewhere keep the filter4 result (unfiltered input for p2 and q2) */
+ p2 = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
+ p1 = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
+ p0 = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
+ q0 = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
+ q1 = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
+ q2 = __lsx_vbitsel_v(q2, q2_filt8_l, flat);
+
+ DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, p3, q3); /* low halves: p3 pairs p2 with p1, q3 pairs p0 with q0 (both reused as scratch) */
+ p2_filt8_l = __lsx_vilvl_h(q3, p3); /* from here the *_filt8_* names are scratch: 32-bit groups p2 p1 p0 q0 for rows 0..3 */
+ p2_filt8_h = __lsx_vilvh_h(q3, p3); /* rows 4..7 */
+ DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, p3, q3); /* same pairing for the high halves */
+ p0_filt8_l = __lsx_vilvl_h(q3, p3); /* rows 8..11 */
+ p0_filt8_h = __lsx_vilvh_h(q3, p3); /* rows 12..15 */
+ q1_filt8_l = __lsx_vilvl_b(q2, q1); /* 16-bit groups q1 q2 for rows 0..7 */
+ q1_filt8_h = __lsx_vilvh_b(q2, q1); /* 16-bit groups q1 q2 for rows 8..15 */
+
+ dst -= 3; /* output starts three columns left of the edge (the p2 column) */
+ __lsx_vstelm_w(p2_filt8_l, dst, 0, 0); /* each row: 4 bytes at dst, then 2 bytes at dst + 4 */
+ __lsx_vstelm_h(q1_filt8_l, dst, 4, 0);
+ dst += stride;
+ __lsx_vstelm_w(p2_filt8_l, dst, 0, 1);
+ __lsx_vstelm_h(q1_filt8_l, dst, 4, 1);
+ dst += stride;
+ __lsx_vstelm_w(p2_filt8_l, dst, 0, 2);
+ __lsx_vstelm_h(q1_filt8_l, dst, 4, 2);
+ dst += stride;
+ __lsx_vstelm_w(p2_filt8_l, dst, 0, 3);
+ __lsx_vstelm_h(q1_filt8_l, dst, 4, 3);
+ dst += stride;
+ __lsx_vstelm_w(p2_filt8_h, dst, 0, 0);
+ __lsx_vstelm_h(q1_filt8_l, dst, 4, 4);
+ dst += stride;
+ __lsx_vstelm_w(p2_filt8_h, dst, 0, 1);
+ __lsx_vstelm_h(q1_filt8_l, dst, 4, 5);
+ dst += stride;
+ __lsx_vstelm_w(p2_filt8_h, dst, 0, 2);
+ __lsx_vstelm_h(q1_filt8_l, dst, 4, 6);
+ dst += stride;
+ __lsx_vstelm_w(p2_filt8_h, dst, 0, 3);
+ __lsx_vstelm_h(q1_filt8_l, dst, 4, 7);
+ dst += stride;
+ __lsx_vstelm_w(p0_filt8_l, dst, 0, 0); /* rows 8..15 follow the same pattern using the high-half vectors */
+ __lsx_vstelm_h(q1_filt8_h, dst, 4, 0);
+ dst += stride;
+ __lsx_vstelm_w(p0_filt8_l, dst, 0, 1);
+ __lsx_vstelm_h(q1_filt8_h, dst, 4, 1);
+ dst += stride;
+ __lsx_vstelm_w(p0_filt8_l, dst, 0, 2);
+ __lsx_vstelm_h(q1_filt8_h, dst, 4, 2);
+ dst += stride;
+ __lsx_vstelm_w(p0_filt8_l, dst, 0, 3);
+ __lsx_vstelm_h(q1_filt8_h, dst, 4, 3);
+ dst += stride;
+ __lsx_vstelm_w(p0_filt8_h, dst, 0, 0);
+ __lsx_vstelm_h(q1_filt8_h, dst, 4, 4);
+ dst += stride;
+ __lsx_vstelm_w(p0_filt8_h, dst, 0, 1);
+ __lsx_vstelm_h(q1_filt8_h, dst, 4, 5);
+ dst += stride;
+ __lsx_vstelm_w(p0_filt8_h, dst, 0, 2);
+ __lsx_vstelm_h(q1_filt8_h, dst, 4, 6);
+ dst += stride;
+ __lsx_vstelm_w(p0_filt8_h, dst, 0, 3);
+ __lsx_vstelm_h(q1_filt8_h, dst, 4, 7);
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_lsx.h b/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_lsx.h
new file mode 100644
index 0000000000..1c43836503
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_lsx.h
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_LOONGARCH_LOOPFILTER_LSX_H_
+#define VPX_VPX_DSP_LOONGARCH_LOOPFILTER_LSX_H_
+
+#include "vpx_util/loongson_intrinsics.h"
+
+#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \
+ limit_in, b_limit_in, thresh_in, hev_out, mask_out, \
+ flat_out) \
+ do { /* writes hev_out and mask_out; flat_out is left holding max(|p1-p0|, |q1-q0|); arguments are expanded more than once */ \
+ __m128i p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \
+ __m128i p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \
+ \
+ /* absolute subtraction of pixel values */ \
+ p3_asub_p2_m = __lsx_vabsd_bu(p3_in, p2_in); \
+ p2_asub_p1_m = __lsx_vabsd_bu(p2_in, p1_in); \
+ p1_asub_p0_m = __lsx_vabsd_bu(p1_in, p0_in); \
+ q1_asub_q0_m = __lsx_vabsd_bu(q1_in, q0_in); \
+ q2_asub_q1_m = __lsx_vabsd_bu(q2_in, q1_in); \
+ q3_asub_q2_m = __lsx_vabsd_bu(q3_in, q2_in); \
+ p0_asub_q0_m = __lsx_vabsd_bu(p0_in, q0_in); \
+ p1_asub_q1_m = __lsx_vabsd_bu(p1_in, q1_in); \
+ \
+ /* calculation of hev: lanes where thresh < max(|p1-p0|, |q1-q0|) */ \
+ flat_out = __lsx_vmax_bu(p1_asub_p0_m, q1_asub_q0_m); \
+ hev_out = __lsx_vslt_bu(thresh_in, flat_out); \
+ \
+ /* calculation of mask: saturating |p0-q0| * 2 + |p1-q1| / 2 against b_limit, then the neighbour differences against limit */ \
+ p0_asub_q0_m = __lsx_vsadd_bu(p0_asub_q0_m, p0_asub_q0_m); \
+ p1_asub_q1_m = __lsx_vsrli_b(p1_asub_q1_m, 1); \
+ p0_asub_q0_m = __lsx_vsadd_bu(p0_asub_q0_m, p1_asub_q1_m); \
+ mask_out = __lsx_vslt_bu(b_limit_in, p0_asub_q0_m); /* comparison result for lanes where b_limit < the edge sum */ \
+ mask_out = __lsx_vmax_bu(flat_out, mask_out); /* running maximum over the comparison result and every adjacent difference */ \
+ p3_asub_p2_m = __lsx_vmax_bu(p3_asub_p2_m, p2_asub_p1_m); \
+ mask_out = __lsx_vmax_bu(p3_asub_p2_m, mask_out); \
+ q2_asub_q1_m = __lsx_vmax_bu(q2_asub_q1_m, q3_asub_q2_m); \
+ mask_out = __lsx_vmax_bu(q2_asub_q1_m, mask_out); \
+ \
+ mask_out = __lsx_vslt_bu(limit_in, mask_out); /* lanes where limit < the running maximum ... */ \
+ mask_out = __lsx_vxori_b(mask_out, 0xff); /* ... inverted, so the final mask marks the lanes that stayed within limit */ \
+ } while (0)
+
+#define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) \
+ do { /* flat_out is both input (a running maximum, e.g. as left by LPF_MASK_HEV) and output (the flat mask) */ \
+ __m128i p2_asub_p0, q2_asub_q0, p3_asub_p0, q3_asub_q0; \
+ __m128i flat4_tmp = __lsx_vldi(1); /* comparison threshold */ \
+ \
+ DUP4_ARG2(__lsx_vabsd_bu, p2_in, p0_in, q2_in, q0_in, p3_in, p0_in, q3_in, \
+ q0_in, p2_asub_p0, q2_asub_q0, p3_asub_p0, q3_asub_q0); \
+ p2_asub_p0 = __lsx_vmax_bu(p2_asub_p0, q2_asub_q0); \
+ flat_out = __lsx_vmax_bu(p2_asub_p0, flat_out); /* fold |p2-p0|, |q2-q0| into the incoming maximum */ \
+ p3_asub_p0 = __lsx_vmax_bu(p3_asub_p0, q3_asub_q0); \
+ flat_out = __lsx_vmax_bu(p3_asub_p0, flat_out); /* then |p3-p0|, |q3-q0| */ \
+ \
+ flat_out = __lsx_vslt_bu(flat4_tmp, flat_out); /* lanes where the threshold < maximum ... */ \
+ flat_out = __lsx_vxori_b(flat_out, 0xff); /* ... inverted */ \
+ flat_out = flat_out & (mask); /* NOTE: mask is not a macro parameter; a variable named mask must exist at the expansion site */ \
+ } while (0)
+
+#define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, q5_in, \
+ q6_in, q7_in, flat_in, flat2_out) \
+ do { /* flat2_out: lanes where every |pN-p0| and |qN-q0| for N = 4..7 is not above the threshold, ANDed with flat_in */ \
+ __m128i flat5_tmp = __lsx_vldi(1); /* comparison threshold */ \
+ __m128i p4_asub_p0, q4_asub_q0, p5_asub_p0, q5_asub_q0; \
+ __m128i p6_asub_p0, q6_asub_q0, p7_asub_p0, q7_asub_q0; \
+ DUP4_ARG2(__lsx_vabsd_bu, p4_in, p0_in, q4_in, q0_in, p5_in, p0_in, q5_in, \
+ q0_in, p4_asub_p0, q4_asub_q0, p5_asub_p0, q5_asub_q0); \
+ DUP4_ARG2(__lsx_vabsd_bu, p6_in, p0_in, q6_in, q0_in, p7_in, p0_in, q7_in, \
+ q0_in, p6_asub_p0, q6_asub_q0, p7_asub_p0, q7_asub_q0); \
+ \
+ DUP2_ARG2(__lsx_vmax_bu, p4_asub_p0, q4_asub_q0, p5_asub_p0, q5_asub_q0, /* flat2_out starts as max(|p5-p0|, |q5-q0|) */ \
+ p4_asub_p0, flat2_out); \
+ flat2_out = __lsx_vmax_bu(p4_asub_p0, flat2_out); /* running maximum over the remaining differences */ \
+ p6_asub_p0 = __lsx_vmax_bu(p6_asub_p0, q6_asub_q0); \
+ flat2_out = __lsx_vmax_bu(p6_asub_p0, flat2_out); \
+ p7_asub_p0 = __lsx_vmax_bu(p7_asub_p0, q7_asub_q0); \
+ flat2_out = __lsx_vmax_bu(p7_asub_p0, flat2_out); \
+ flat2_out = __lsx_vslt_bu(flat5_tmp, flat2_out); /* lanes where the threshold < maximum ... */ \
+ flat2_out = __lsx_vxori_b(flat2_out, 0xff); /* ... inverted */ \
+ flat2_out = flat2_out & flat_in; \
+ } while (0)
+
+#define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask, hev, p1_out, \
+ p0_out, q0_out, q1_out) \
+ do { /* 4-tap filter on p1, p0, q0, q1; NOTE: the hev argument is inverted in place, so the caller's variable is modified */ \
+ __m128i p1_m, p0_m, q0_m, q1_m, filt, q0_sub_p0, t1, t2; \
+ const __m128i cnst4b = __lsx_vldi(4); \
+ const __m128i cnst3b = __lsx_vldi(3); \
+ DUP4_ARG2(__lsx_vxori_b, p1_in, 0x80, p0_in, 0x80, q0_in, 0x80, q1_in, /* flip the top bit: unsigned pixels become signed values; inputs are copied, so outputs may alias inputs */ \
+ 0x80, p1_m, p0_m, q0_m, q1_m); \
+ filt = __lsx_vssub_b(p1_m, q1_m); /* saturating p1 - q1 ... */ \
+ filt &= hev; /* ... kept only where hev is set */ \
+ \
+ q0_sub_p0 = __lsx_vssub_b(q0_m, p0_m); \
+ filt = __lsx_vsadd_b(filt, q0_sub_p0); /* three saturating adds: filt += 3 * (q0 - p0) */ \
+ filt = __lsx_vsadd_b(filt, q0_sub_p0); \
+ filt = __lsx_vsadd_b(filt, q0_sub_p0); \
+ filt &= mask; /* zero the adjustment outside the filter mask */ \
+ DUP2_ARG2(__lsx_vsadd_b, filt, cnst4b, filt, cnst3b, t1, t2); /* t1 = filt + 4, t2 = filt + 3 (saturating) */ \
+ DUP2_ARG2(__lsx_vsrai_b, t1, 3, t2, 3, t1, t2); /* arithmetic shift right by 3 */ \
+ \
+ q0_m = __lsx_vssub_b(q0_m, t1); \
+ p0_m = __lsx_vsadd_b(p0_m, t2); \
+ DUP2_ARG2(__lsx_vxori_b, q0_m, 0x80, p0_m, 0x80, q0_out, p0_out); /* back to unsigned */ \
+ \
+ filt = __lsx_vsrari_b(t1, 1); /* rounding shift right by 1 of t1 */ \
+ hev = __lsx_vxori_b(hev, 0xff); /* invert hev in place */ \
+ filt &= hev; /* outer taps are adjusted only where hev was not set */ \
+ q1_m = __lsx_vssub_b(q1_m, filt); \
+ p1_m = __lsx_vsadd_b(p1_m, filt); \
+ DUP2_ARG2(__lsx_vxori_b, q1_m, 0x80, p1_m, 0x80, q1_out, p1_out); /* back to unsigned */ \
+ } while (0)
+
+#define VP9_FILTER8(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \
+ p2_filt8_out, p1_filt8_out, p0_filt8_out, q0_filt8_out, \
+ q1_filt8_out, q2_filt8_out) \
+ do { /* six smoothed outputs from 16-bit lanes, each a weighted sum with total weight 8, shifted right by 3 with rounding; inputs are expanded more than once */ \
+ __m128i tmp_filt8_0, tmp_filt8_1, tmp_filt8_2; \
+ \
+ tmp_filt8_2 = __lsx_vadd_h(p2_in, p1_in); \
+ tmp_filt8_2 = __lsx_vadd_h(tmp_filt8_2, p0_in); /* p2 + p1 + p0 */ \
+ tmp_filt8_0 = __lsx_vslli_h(p3_in, 1); /* 2 * p3 */ \
+ \
+ tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_0, tmp_filt8_2); \
+ tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_0, q0_in); /* 2*p3 + p2 + p1 + p0 + q0 */ \
+ tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_0, p3_in); \
+ tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_1, p2_in); \
+ p2_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3); /* p2' from 3*p3 + 2*p2 + p1 + p0 + q0 */ \
+ \
+ tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_0, p1_in); \
+ tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_1, q1_in); \
+ p1_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3); /* p1' from 2*p3 + p2 + 2*p1 + p0 + q0 + q1 */ \
+ \
+ tmp_filt8_1 = __lsx_vadd_h(q2_in, q1_in); \
+ tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_1, q0_in); /* q2 + q1 + q0 */ \
+ tmp_filt8_2 = __lsx_vadd_h(tmp_filt8_2, tmp_filt8_1); /* p2 + p1 + p0 + q0 + q1 + q2 */ \
+ tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_2, p0_in); \
+ tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_0, p3_in); \
+ p0_filt8_out = __lsx_vsrari_h(tmp_filt8_0, 3); /* p0' from p3 + p2 + p1 + 2*p0 + q0 + q1 + q2 */ \
+ \
+ tmp_filt8_0 = __lsx_vadd_h(q2_in, q3_in); \
+ tmp_filt8_0 = __lsx_vadd_h(p0_in, tmp_filt8_0); \
+ tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_0, tmp_filt8_1); \
+ tmp_filt8_1 = __lsx_vadd_h(q3_in, q3_in); \
+ tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_1, tmp_filt8_0); \
+ q2_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3); /* q2' from p0 + q0 + q1 + 2*q2 + 3*q3 */ \
+ \
+ tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_2, q3_in); \
+ tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_0, q0_in); \
+ q0_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3); /* q0' from p2 + p1 + p0 + 2*q0 + q1 + q2 + q3 */ \
+ \
+ tmp_filt8_1 = __lsx_vsub_h(tmp_filt8_0, p2_in); \
+ tmp_filt8_0 = __lsx_vadd_h(q1_in, q3_in); \
+ tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_0, tmp_filt8_1); \
+ q1_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3); /* q1' from p1 + p0 + q0 + 2*q1 + q2 + 2*q3 */ \
+ } while (0)
+
+#endif // VPX_VPX_DSP_LOONGARCH_LOOPFILTER_LSX_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/quantize_intrin_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/quantize_intrin_lsx.c
new file mode 100644
index 0000000000..77be0bb4fe
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/quantize_intrin_lsx.c
@@ -0,0 +1,248 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_util/loongson_intrinsics.h"
+/*
+ * Quantizes one vector of eight 16-bit coefficients.
+ *
+ * coeff     - original coefficients; only their signs are used here.
+ * coeff_abs - magnitudes of coeff.
+ * round, quant, shift - per-lane quantizer parameters.
+ * cmp_mask  - lane mask ANDed into the result, so lanes whose mask is zero
+ *             come out as zero.
+ *
+ * Returns the signed, masked quantized coefficients.
+ */
+static INLINE __m128i calculate_qcoeff(__m128i coeff, __m128i coeff_abs,
+ __m128i round, __m128i quant,
+ __m128i shift, __m128i cmp_mask) {
+ __m128i rounded, qcoeff;
+ // vmuh_h keeps the upper 16 bits of each signed 16x16 product (LSX docs).
+ rounded = __lsx_vsadd_h(coeff_abs, round);  // saturating add
+ qcoeff = __lsx_vmuh_h(rounded, quant);
+ qcoeff = __lsx_vadd_h(rounded, qcoeff);
+ qcoeff = __lsx_vmuh_h(qcoeff, shift);
+ qcoeff = __lsx_vsigncov_h(coeff, qcoeff);  // re-apply the sign of coeff
+ qcoeff = __lsx_vand_v(qcoeff, cmp_mask);  // zero the lanes the mask rejects
+
+ return qcoeff;
+}
+/*
+ * Dequantizes one vector: multiplies qcoeff by dequant lane by lane with
+ * vmul_h (16-bit result per lane) and stores the 16-byte result at dqcoeff.
+ */
+static INLINE void calculate_dqcoeff_and_store(__m128i qcoeff, __m128i dequant,
+ int16_t *dqcoeff) {
+ __m128i dqcoeff16 = __lsx_vmul_h(qcoeff, dequant);
+ __lsx_vst(dqcoeff16, dqcoeff, 0);
+}
+/*
+ * 32x32 variant of the dequantize-and-store step.
+ *
+ * Builds the 32-bit product |qcoeff| * dequant for each lane, shifts it right
+ * by one, restores the sign of qcoeff and stores the even 16-bit halves of
+ * the results (eight values, 16 bytes) at dqcoeff.
+ */
+static INLINE void calculate_dqcoeff_and_store_32x32(__m128i qcoeff,
+ __m128i dequant,
+ int16_t *dqcoeff) {
+ // Un-sign to bias rounding like C.
+ __m128i low, high, dqcoeff32_0, dqcoeff32_1, res;
+ __m128i zero = __lsx_vldi(0);
+ __m128i coeff = __lsx_vabsd_h(qcoeff, zero);
+ // Interleave qcoeff with zero so each 32-bit lane carries the lane's sign.
+ const __m128i sign_0 = __lsx_vilvl_h(qcoeff, zero);
+ const __m128i sign_1 = __lsx_vilvh_h(qcoeff, zero);
+ // Low and high 16 bits of every product, paired up into 32-bit values.
+ low = __lsx_vmul_h(coeff, dequant);
+ high = __lsx_vmuh_h(coeff, dequant);
+ dqcoeff32_0 = __lsx_vilvl_h(high, low);
+ dqcoeff32_1 = __lsx_vilvh_h(high, low);
+
+ // "Divide" by 2.
+ dqcoeff32_0 = __lsx_vsrai_w(dqcoeff32_0, 1);
+ dqcoeff32_1 = __lsx_vsrai_w(dqcoeff32_1, 1);
+ dqcoeff32_0 = __lsx_vsigncov_w(sign_0, dqcoeff32_0);  // put the sign back
+ dqcoeff32_1 = __lsx_vsigncov_w(sign_1, dqcoeff32_1);
+ res = __lsx_vpickev_h(dqcoeff32_1, dqcoeff32_0);  // narrow back to 16 bits
+ __lsx_vst(res, dqcoeff, 0);
+}
+/*
+ * Computes per-lane end-of-block candidates for 16 coefficients.
+ *
+ * Loads the 16 entries of scan that start at index, clears the entries whose
+ * coefficient in coeff0/coeff1 equals zero, and returns the lane-wise maximum
+ * of the two masked vectors. zero must be an all-zero vector. The callers in
+ * this file pass their iscan table as the scan argument.
+ */
+static INLINE __m128i scan_for_eob(__m128i coeff0, __m128i coeff1,
+ const int16_t *scan, int index,
+ __m128i zero) {
+ const __m128i zero_coeff0 = __lsx_vseq_h(coeff0, zero);
+ const __m128i zero_coeff1 = __lsx_vseq_h(coeff1, zero);
+ __m128i scan0 = __lsx_vld(scan + index, 0);
+ __m128i scan1 = __lsx_vld(scan + index + 8, 0);
+ __m128i eob0, eob1;
+ // vandn_v(a, b) is (~a & b): keep scan entries of non-zero coefficients.
+ eob0 = __lsx_vandn_v(zero_coeff0, scan0);
+ eob1 = __lsx_vandn_v(zero_coeff1, scan1);
+ return __lsx_vmax_h(eob0, eob1);
+}
+/*
+ * Reduces the eight 16-bit lanes of eob to one value.
+ *
+ * Three shuffle-and-max steps fold the vector onto itself (first by 32-bit
+ * words, then twice by halfwords); the result is read from halfword lane 1.
+ */
+static INLINE int16_t accumulate_eob(__m128i eob) {
+ __m128i eob_shuffled;
+ int16_t res_m;
+
+ eob_shuffled = __lsx_vshuf4i_w(eob, 0xe);
+ eob = __lsx_vmax_h(eob, eob_shuffled);
+ eob_shuffled = __lsx_vshuf4i_h(eob, 0xe);
+ eob = __lsx_vmax_h(eob, eob_shuffled);
+ eob_shuffled = __lsx_vshuf4i_h(eob, 0x1);
+ eob = __lsx_vmax_h(eob, eob_shuffled);
+ res_m = __lsx_vpickve2gr_h(eob, 1);
+
+ return res_m;
+}
+/*
+ * LSX quantizer; only compiled when CONFIG_VP9_HIGHBITDEPTH is 0.
+ *
+ * vpx_quantize_b_lsx quantizes n_coeffs 16-bit coefficients from coeff_ptr.
+ * It writes the quantized values to qcoeff_ptr, the dequantized values to
+ * dqcoeff_ptr and the reduced end-of-block value to *eob_ptr.
+ * zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr and dequant_ptr are each
+ * read as one 16-byte vector. scan is unused; iscan feeds scan_for_eob().
+ *
+ * The first 16 coefficients are always processed and the loop then advances
+ * 16 at a time, so n_coeffs is presumably a multiple of 16 and at least 16 -
+ * confirm with the callers.
+ */
+#if !CONFIG_VP9_HIGHBITDEPTH
+void vpx_quantize_b_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr,
+ int16_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const int16_t *scan,
+ const int16_t *iscan) {
+ __m128i zero = __lsx_vldi(0);
+ int index = 16;
+
+ __m128i zbin, round, quant, dequant, quant_shift;
+ __m128i coeff0, coeff1;
+ __m128i qcoeff0, qcoeff1;
+ __m128i cmp_mask0, cmp_mask1;
+ __m128i eob, eob0;
+
+ (void)scan;
+
+ zbin = __lsx_vld(zbin_ptr, 0);
+ round = __lsx_vld(round_ptr, 0);
+ quant = __lsx_vld(quant_ptr, 0);
+ dequant = __lsx_vld(dequant_ptr, 0);
+ quant_shift = __lsx_vld(quant_shift_ptr, 0);
+ // Handle one DC and first 15 AC.
+ DUP2_ARG2(__lsx_vld, coeff_ptr, 0, coeff_ptr, 16, coeff0, coeff1);
+ qcoeff0 = __lsx_vabsd_h(coeff0, zero);
+ qcoeff1 = __lsx_vabsd_h(coeff1, zero);
+ // A lane is kept only when zbin <= |coeff|.
+ cmp_mask0 = __lsx_vsle_h(zbin, qcoeff0);
+ zbin = __lsx_vilvh_d(zbin, zbin);  // upper half duplicated: DC lane dropped
+ cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1);
+
+ qcoeff0 =
+ calculate_qcoeff(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0);
+ round = __lsx_vilvh_d(round, round);
+ quant = __lsx_vilvh_d(quant, quant);
+ quant_shift = __lsx_vilvh_d(quant_shift, quant_shift);
+ qcoeff1 =
+ calculate_qcoeff(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1);
+
+ __lsx_vst(qcoeff0, qcoeff_ptr, 0);
+ __lsx_vst(qcoeff1, qcoeff_ptr, 16);
+
+ calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr);
+ dequant = __lsx_vilvh_d(dequant, dequant);
+ calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8);
+
+ eob = scan_for_eob(qcoeff0, qcoeff1, iscan, 0, zero);
+ // AC only loop.
+ while (index < n_coeffs) {
+ coeff0 = __lsx_vld(coeff_ptr + index, 0);
+ coeff1 = __lsx_vld(coeff_ptr + index + 8, 0);
+
+ qcoeff0 = __lsx_vabsd_h(coeff0, zero);
+ qcoeff1 = __lsx_vabsd_h(coeff1, zero);
+
+ cmp_mask0 = __lsx_vsle_h(zbin, qcoeff0);
+ cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1);
+
+ qcoeff0 =
+ calculate_qcoeff(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0);
+ qcoeff1 =
+ calculate_qcoeff(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1);
+
+ __lsx_vst(qcoeff0, qcoeff_ptr + index, 0);
+ __lsx_vst(qcoeff1, qcoeff_ptr + index + 8, 0);
+
+ calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index);
+ calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8);
+
+ eob0 = scan_for_eob(qcoeff0, qcoeff1, iscan, index, zero);
+ eob = __lsx_vmax_h(eob, eob0);  // running lane-wise maximum
+
+ index += 16;
+ }
+
+ *eob_ptr = accumulate_eob(eob);
+}
+/*
+ * LSX quantizer for 32x32 blocks.
+ *
+ * Same outputs as vpx_quantize_b_lsx (qcoeff_ptr, dqcoeff_ptr, *eob_ptr) with
+ * these differences, all visible below: zbin and round are halved with
+ * rounding, quant_shift is doubled, dequantization goes through
+ * calculate_dqcoeff_and_store_32x32(), and the loop always covers 32 * 32
+ * coefficients. scan and n_coeffs are unused.
+ */
+void vpx_quantize_b_32x32_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ __m128i zero = __lsx_vldi(0);
+ int index;
+
+ __m128i zbin, round, quant, dequant, quant_shift;
+ __m128i coeff0, coeff1, qcoeff0, qcoeff1, cmp_mask0, cmp_mask1;
+ __m128i eob = zero, eob0;
+
+ (void)scan;
+ (void)n_coeffs;
+ // vsrari_h(x, 1) is a rounding shift: (x + 1) >> 1 per lane.
+ zbin = __lsx_vld(zbin_ptr, 0);
+ zbin = __lsx_vsrari_h(zbin, 1);
+ round = __lsx_vld(round_ptr, 0);
+ round = __lsx_vsrari_h(round, 1);
+
+ quant = __lsx_vld(quant_ptr, 0);
+ dequant = __lsx_vld(dequant_ptr, 0);
+ quant_shift = __lsx_vld(quant_shift_ptr, 0);
+ quant_shift = __lsx_vslli_h(quant_shift, 1);
+ // Handle one DC and first 15 AC.
+ DUP2_ARG2(__lsx_vld, coeff_ptr, 0, coeff_ptr, 16, coeff0, coeff1);
+ qcoeff0 = __lsx_vabsd_h(coeff0, zero);
+ qcoeff1 = __lsx_vabsd_h(coeff1, zero);
+
+ cmp_mask0 = __lsx_vsle_h(zbin, qcoeff0);
+ // remove DC from zbin
+ zbin = __lsx_vilvh_d(zbin, zbin);
+ cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1);
+
+ qcoeff0 =
+ calculate_qcoeff(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0);
+ // remove DC from round, quant and quant_shift
+ round = __lsx_vilvh_d(round, round);
+ quant = __lsx_vilvh_d(quant, quant);
+ quant_shift = __lsx_vilvh_d(quant_shift, quant_shift);
+ qcoeff1 =
+ calculate_qcoeff(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1);
+ __lsx_vst(qcoeff0, qcoeff_ptr, 0);
+ __lsx_vst(qcoeff1, qcoeff_ptr, 16);
+
+ calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, dqcoeff_ptr);
+ dequant = __lsx_vilvh_d(dequant, dequant);
+ calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, dqcoeff_ptr + 8);
+ eob = scan_for_eob(qcoeff0, qcoeff1, iscan, 0, zero);
+ // AC only loop.
+ for (index = 16; index < 32 * 32; index += 16) {
+ coeff0 = __lsx_vld(coeff_ptr + index, 0);
+ coeff1 = __lsx_vld(coeff_ptr + index + 8, 0);
+
+ qcoeff0 = __lsx_vabsd_h(coeff0, zero);
+ qcoeff1 = __lsx_vabsd_h(coeff1, zero);
+
+ cmp_mask0 = __lsx_vsle_h(zbin, qcoeff0);
+ cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1);
+
+ qcoeff0 =
+ calculate_qcoeff(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0);
+ qcoeff1 =
+ calculate_qcoeff(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1);
+ __lsx_vst(qcoeff0, qcoeff_ptr + index, 0);
+ __lsx_vst(qcoeff1, qcoeff_ptr + index + 8, 0);
+
+ calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, dqcoeff_ptr + index);
+ calculate_dqcoeff_and_store_32x32(qcoeff1, dequant,
+ dqcoeff_ptr + 8 + index);
+ eob0 = scan_for_eob(qcoeff0, qcoeff1, iscan, index, zero);
+ eob = __lsx_vmax_h(eob, eob0);
+ }
+
+ *eob_ptr = accumulate_eob(eob);
+}
+#endif
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/sad_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/sad_lsx.c
new file mode 100644
index 0000000000..b6fbedb0d0
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/sad_lsx.c
@@ -0,0 +1,717 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_util/loongson_intrinsics.h"
+/*
+ * Partial SAD of two 16-byte vector pairs.
+ *
+ * Takes the unsigned absolute byte differences |in0 - ref0| and |in1 - ref1|,
+ * widens each to 16-bit sums with vhaddw_hu_bu and returns the lane-wise
+ * total of both, i.e. eight 16-bit partial sums.
+ */
+static INLINE __m128i sad_ub2_uh(__m128i in0, __m128i in1, __m128i ref0,
+ __m128i ref1) {
+ __m128i diff0_m, diff1_m, sad_m0;
+ __m128i sad_m = __lsx_vldi(0);
+
+ diff0_m = __lsx_vabsd_bu(in0, ref0);
+ diff1_m = __lsx_vabsd_bu(in1, ref1);
+
+ sad_m0 = __lsx_vhaddw_hu_bu(diff0_m, diff0_m);
+ sad_m = __lsx_vadd_h(sad_m, sad_m0);
+ sad_m0 = __lsx_vhaddw_hu_bu(diff1_m, diff1_m);
+ sad_m = __lsx_vadd_h(sad_m, sad_m0);
+
+ return sad_m;
+}
+/*
+ * Horizontal sum of the 32-bit lanes of in, treated as unsigned: widens
+ * 32 -> 64 -> 128 bits with vhaddw and returns the low 32 bits of the total.
+ */
+static INLINE uint32_t hadd_uw_u32(__m128i in) {
+ __m128i res0_m;
+ uint32_t sum_m;
+
+ res0_m = __lsx_vhaddw_du_wu(in, in);
+ res0_m = __lsx_vhaddw_qu_du(res0_m, res0_m);
+ sum_m = __lsx_vpickve2gr_w(res0_m, 0);
+
+ return sum_m;
+}
+/*
+ * Horizontal sum of the unsigned 16-bit lanes of in: widens them to 32-bit
+ * sums with vhaddw_wu_hu and finishes the reduction in hadd_uw_u32().
+ */
+static INLINE uint32_t hadd_uh_u32(__m128i in) {
+ __m128i res_m;
+ uint32_t sum_m;
+
+ res_m = __lsx_vhaddw_wu_hu(in, in);
+ sum_m = hadd_uw_u32(res_m);
+
+ return sum_m;
+}
+/*
+ * Signed counterpart of hadd_uw_u32(): horizontal sum of the 32-bit lanes of
+ * in using the signed vhaddw forms; returns the low 32 bits of the total.
+ */
+static INLINE int32_t hadd_sw_s32(__m128i in) {
+ __m128i res0_m;
+ int32_t sum_m;
+
+ res0_m = __lsx_vhaddw_d_w(in, in);
+ res0_m = __lsx_vhaddw_q_d(res0_m, res0_m);
+ sum_m = __lsx_vpickve2gr_w(res0_m, 0);
+
+ return sum_m;
+}
+/*
+ * Sum of absolute differences between src and ref for 8-pixel-wide rows.
+ *
+ * Each loop iteration consumes four rows (height >> 2 iterations), so height
+ * is presumably a multiple of 4. Partial sums are kept in 16-bit lanes and
+ * reduced at the end.
+ * NOTE(review): every row is fetched with a full 16-byte vld although only
+ * the low 8 bytes are kept - confirm that reading past the 8-pixel row is
+ * acceptable for all callers' buffers.
+ */
+static uint32_t sad_8width_lsx(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ int32_t height) {
+ int32_t ht_cnt;
+ uint32_t res;
+ __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3, sad_tmp;
+ __m128i sad = __lsx_vldi(0);
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src0, ref0);
+ src += src_stride;
+ ref += ref_stride;
+ DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src1, ref1);
+ src += src_stride;
+ ref += ref_stride;
+ DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src2, ref2);
+ src += src_stride;
+ ref += ref_stride;
+ DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src3, ref3);
+ src += src_stride;
+ ref += ref_stride;
+ DUP4_ARG2(__lsx_vpickev_d, src1, src0, src3, src2, ref1, ref0, ref3, ref2,
+ src0, src1, ref0, ref1);  // pack the low 8 bytes of two rows per vector
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+ sad = __lsx_vadd_h(sad, sad_tmp);
+ }
+ res = hadd_uh_u32(sad);
+ return res;
+}
+/*
+ * Sum of absolute differences between src and ref for 16-pixel-wide rows.
+ *
+ * Each loop iteration consumes four rows as two row pairs (height >> 2
+ * iterations), so height is presumably a multiple of 4. Partial sums are
+ * kept in 16-bit lanes and reduced at the end.
+ */
+static uint32_t sad_16width_lsx(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ int32_t height) {
+ int32_t ht_cnt = (height >> 2);
+ uint32_t res;
+ __m128i src0, src1, ref0, ref1, sad_tmp;
+ __m128i sad = __lsx_vldi(0);
+ int32_t src_stride2 = src_stride << 1;
+ int32_t ref_stride2 = ref_stride << 1;
+
+ for (; ht_cnt--;) {
+ DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src0, ref0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, ref, ref_stride, src1, ref1);
+ src += src_stride2;
+ ref += ref_stride2;
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+ sad = __lsx_vadd_h(sad, sad_tmp);
+ // Second row pair of this iteration.
+ DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src0, ref0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, ref, ref_stride, src1, ref1);
+ src += src_stride2;
+ ref += ref_stride2;
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+ sad = __lsx_vadd_h(sad, sad_tmp);
+ }
+
+ res = hadd_uh_u32(sad);
+ return res;
+}
+/*
+ * Sum of absolute differences between src and ref for 32-pixel-wide rows.
+ *
+ * Every row is read as two 16-byte vectors (byte offsets 0 and 16). The loop
+ * body is unrolled over four rows (height >> 2 iterations), so height is
+ * presumably a multiple of 4. Partial sums are kept in 16-bit lanes.
+ */
+static uint32_t sad_32width_lsx(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ int32_t height) {
+ int32_t ht_cnt = (height >> 2);
+ uint32_t res;
+ __m128i src0, src1, ref0, ref1;
+ __m128i sad_tmp;
+ __m128i sad = __lsx_vldi(0);
+
+ for (; ht_cnt--;) {
+ DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
+ src += src_stride;
+ DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1);
+ ref += ref_stride;
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+ sad = __lsx_vadd_h(sad, sad_tmp);
+
+ DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
+ src += src_stride;
+ DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1);
+ ref += ref_stride;
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+ sad = __lsx_vadd_h(sad, sad_tmp);
+
+ DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
+ src += src_stride;
+ DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1);
+ ref += ref_stride;
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+ sad = __lsx_vadd_h(sad, sad_tmp);
+
+ DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
+ src += src_stride;
+ DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1);
+ ref += ref_stride;
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+ sad = __lsx_vadd_h(sad, sad_tmp);
+ }
+ res = hadd_uh_u32(sad);
+ return res;
+}
+/*
+ * Sum of absolute differences between src and ref for 64-pixel-wide rows.
+ *
+ * Every row is read as four 16-byte vectors. Bytes 0..31 accumulate into
+ * sad0 and bytes 32..63 into sad1, both as 16-bit lanes; the two are reduced
+ * separately and added as 32-bit scalars. Two rows are handled per iteration
+ * (height >> 1 iterations), so height is presumably even.
+ */
+static uint32_t sad_64width_lsx(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ int32_t height) {
+ int32_t ht_cnt = (height >> 1);
+ uint32_t sad = 0;
+ __m128i src0, src1, src2, src3;
+ __m128i ref0, ref1, ref2, ref3;
+ __m128i sad_tmp;
+ __m128i sad0 = __lsx_vldi(0);
+ __m128i sad1 = sad0;
+
+ for (; ht_cnt--;) {
+ DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
+ src3);
+ src += src_stride;
+ DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2,
+ ref3);
+ ref += ref_stride;
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+ sad0 = __lsx_vadd_h(sad0, sad_tmp);
+ sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3);
+ sad1 = __lsx_vadd_h(sad1, sad_tmp);
+ // Second row of this iteration.
+ DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
+ src3);
+ src += src_stride;
+ DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2,
+ ref3);
+ ref += ref_stride;
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+ sad0 = __lsx_vadd_h(sad0, sad_tmp);
+ sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3);
+ sad1 = __lsx_vadd_h(sad1, sad_tmp);
+ }
+
+ sad = hadd_uh_u32(sad0);
+ sad += hadd_uh_u32(sad1);
+
+ return sad;
+}
+/*
+ * SAD of one 8-pixel-wide source block against four reference blocks.
+ *
+ * aref_ptr[0..3] are the four reference pointers, all using ref_stride;
+ * sad_array[0..3] receive the matching sums. Four rows are handled per
+ * iteration (height >> 2 iterations), so height is presumably a multiple
+ * of 4. Rows are fetched as full 16-byte vectors and the low 8 bytes of two
+ * rows are packed into one vector before the SAD step.
+ */
+static void sad_8width_x4d_lsx(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *const aref_ptr[],
+ int32_t ref_stride, int32_t height,
+ uint32_t *sad_array) {
+ int32_t ht_cnt = (height >> 2);
+ const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
+ __m128i src0, src1, src2, src3, sad_tmp;
+ __m128i ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
+ __m128i ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15;
+ __m128i sad0 = __lsx_vldi(0);
+ __m128i sad1 = sad0;
+ __m128i sad2 = sad0;
+ __m128i sad3 = sad0;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+ int32_t ref_stride2 = ref_stride << 1;
+ int32_t ref_stride3 = ref_stride2 + ref_stride;
+ int32_t ref_stride4 = ref_stride2 << 1;
+
+ ref0_ptr = aref_ptr[0];
+ ref1_ptr = aref_ptr[1];
+ ref2_ptr = aref_ptr[2];
+ ref3_ptr = aref_ptr[3];
+
+ for (; ht_cnt--;) {
+ src0 = __lsx_vld(src_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, src_ptr, src_stride, src_ptr, src_stride2, src1,
+ src2);
+ src3 = __lsx_vldx(src_ptr, src_stride3);
+ src_ptr += src_stride4;
+ ref0 = __lsx_vld(ref0_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, ref0_ptr, ref_stride, ref0_ptr, ref_stride2, ref1,
+ ref2);
+ ref3 = __lsx_vldx(ref0_ptr, ref_stride3);
+ ref0_ptr += ref_stride4;
+ ref4 = __lsx_vld(ref1_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, ref1_ptr, ref_stride, ref1_ptr, ref_stride2, ref5,
+ ref6);
+ ref7 = __lsx_vldx(ref1_ptr, ref_stride3);
+ ref1_ptr += ref_stride4;
+ ref8 = __lsx_vld(ref2_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, ref2_ptr, ref_stride, ref2_ptr, ref_stride2, ref9,
+ ref10);
+ ref11 = __lsx_vldx(ref2_ptr, ref_stride3);
+ ref2_ptr += ref_stride4;
+ ref12 = __lsx_vld(ref3_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, ref3_ptr, ref_stride, ref3_ptr, ref_stride2, ref13,
+ ref14);
+ ref15 = __lsx_vldx(ref3_ptr, ref_stride3);
+ ref3_ptr += ref_stride4;
+ // The packed source rows are reused for all four references.
+ DUP2_ARG2(__lsx_vpickev_d, src1, src0, src3, src2, src0, src1);
+ DUP2_ARG2(__lsx_vpickev_d, ref1, ref0, ref3, ref2, ref0, ref1);
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+ sad0 = __lsx_vadd_h(sad0, sad_tmp);
+
+ DUP2_ARG2(__lsx_vpickev_d, ref5, ref4, ref7, ref6, ref0, ref1);
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+ sad1 = __lsx_vadd_h(sad1, sad_tmp);
+
+ DUP2_ARG2(__lsx_vpickev_d, ref9, ref8, ref11, ref10, ref0, ref1);
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+ sad2 = __lsx_vadd_h(sad2, sad_tmp);
+
+ DUP2_ARG2(__lsx_vpickev_d, ref13, ref12, ref15, ref14, ref0, ref1);
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+ sad3 = __lsx_vadd_h(sad3, sad_tmp);
+ }
+ sad_array[0] = hadd_uh_u32(sad0);
+ sad_array[1] = hadd_uh_u32(sad1);
+ sad_array[2] = hadd_uh_u32(sad2);
+ sad_array[3] = hadd_uh_u32(sad3);
+}
+/*
+ * SAD of one 16-pixel-wide source block against four reference blocks.
+ *
+ * aref_ptr[0..3] are the four reference pointers, all using ref_stride;
+ * sad_array[0..3] receive the matching sums. Two rows are handled per
+ * iteration (height >> 1 iterations), so height is presumably even. Each
+ * source row is loaded once and compared against all four references.
+ */
+static void sad_16width_x4d_lsx(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *const aref_ptr[],
+ int32_t ref_stride, int32_t height,
+ uint32_t *sad_array) {
+ int32_t ht_cnt = (height >> 1);
+ const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
+ __m128i src, ref0, ref1, ref2, ref3, diff, sad_tmp;
+ __m128i sad0 = __lsx_vldi(0);
+ __m128i sad1 = sad0;
+ __m128i sad2 = sad0;
+ __m128i sad3 = sad0;
+
+ ref0_ptr = aref_ptr[0];
+ ref1_ptr = aref_ptr[1];
+ ref2_ptr = aref_ptr[2];
+ ref3_ptr = aref_ptr[3];
+
+ for (; ht_cnt--;) {
+ src = __lsx_vld(src_ptr, 0);
+ src_ptr += src_stride;
+ ref0 = __lsx_vld(ref0_ptr, 0);
+ ref0_ptr += ref_stride;
+ ref1 = __lsx_vld(ref1_ptr, 0);
+ ref1_ptr += ref_stride;
+ ref2 = __lsx_vld(ref2_ptr, 0);
+ ref2_ptr += ref_stride;
+ ref3 = __lsx_vld(ref3_ptr, 0);
+ ref3_ptr += ref_stride;
+
+ diff = __lsx_vabsd_bu(src, ref0);
+ sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
+ sad0 = __lsx_vadd_h(sad0, sad_tmp);
+ diff = __lsx_vabsd_bu(src, ref1);
+ sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
+ sad1 = __lsx_vadd_h(sad1, sad_tmp);
+ diff = __lsx_vabsd_bu(src, ref2);
+ sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
+ sad2 = __lsx_vadd_h(sad2, sad_tmp);
+ diff = __lsx_vabsd_bu(src, ref3);
+ sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
+ sad3 = __lsx_vadd_h(sad3, sad_tmp);
+ // Second row of this iteration.
+ src = __lsx_vld(src_ptr, 0);
+ src_ptr += src_stride;
+ ref0 = __lsx_vld(ref0_ptr, 0);
+ ref0_ptr += ref_stride;
+ ref1 = __lsx_vld(ref1_ptr, 0);
+ ref1_ptr += ref_stride;
+ ref2 = __lsx_vld(ref2_ptr, 0);
+ ref2_ptr += ref_stride;
+ ref3 = __lsx_vld(ref3_ptr, 0);
+ ref3_ptr += ref_stride;
+
+ diff = __lsx_vabsd_bu(src, ref0);
+ sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
+ sad0 = __lsx_vadd_h(sad0, sad_tmp);
+ diff = __lsx_vabsd_bu(src, ref1);
+ sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
+ sad1 = __lsx_vadd_h(sad1, sad_tmp);
+ diff = __lsx_vabsd_bu(src, ref2);
+ sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
+ sad2 = __lsx_vadd_h(sad2, sad_tmp);
+ diff = __lsx_vabsd_bu(src, ref3);
+ sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
+ sad3 = __lsx_vadd_h(sad3, sad_tmp);
+ }
+ sad_array[0] = hadd_uh_u32(sad0);
+ sad_array[1] = hadd_uh_u32(sad1);
+ sad_array[2] = hadd_uh_u32(sad2);
+ sad_array[3] = hadd_uh_u32(sad3);
+}
+/*
+ * SAD of one 32-pixel-wide source block against four reference blocks.
+ *
+ * aref_ptr[0..3] are the four reference pointers, all using ref_stride;
+ * sad_array[0..3] receive the matching sums. One row is handled per
+ * iteration (height iterations); the two source vectors of a row are loaded
+ * once and compared against each reference in turn.
+ */
+static void sad_32width_x4d_lsx(const uint8_t *src, int32_t src_stride,
+ const uint8_t *const aref_ptr[],
+ int32_t ref_stride, int32_t height,
+ uint32_t *sad_array) {
+ const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
+ int32_t ht_cnt = height;
+ __m128i src0, src1, ref0, ref1, sad_tmp;
+ __m128i sad0 = __lsx_vldi(0);
+ __m128i sad1 = sad0;
+ __m128i sad2 = sad0;
+ __m128i sad3 = sad0;
+
+ ref0_ptr = aref_ptr[0];
+ ref1_ptr = aref_ptr[1];
+ ref2_ptr = aref_ptr[2];
+ ref3_ptr = aref_ptr[3];
+
+ for (; ht_cnt--;) {
+ DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
+ src += src_stride;
+
+ DUP2_ARG2(__lsx_vld, ref0_ptr, 0, ref0_ptr, 16, ref0, ref1);
+ ref0_ptr += ref_stride;
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+ sad0 = __lsx_vadd_h(sad0, sad_tmp);
+
+ DUP2_ARG2(__lsx_vld, ref1_ptr, 0, ref1_ptr, 16, ref0, ref1);
+ ref1_ptr += ref_stride;
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+ sad1 = __lsx_vadd_h(sad1, sad_tmp);
+
+ DUP2_ARG2(__lsx_vld, ref2_ptr, 0, ref2_ptr, 16, ref0, ref1);
+ ref2_ptr += ref_stride;
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+ sad2 = __lsx_vadd_h(sad2, sad_tmp);
+
+ DUP2_ARG2(__lsx_vld, ref3_ptr, 0, ref3_ptr, 16, ref0, ref1);
+ ref3_ptr += ref_stride;
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+ sad3 = __lsx_vadd_h(sad3, sad_tmp);
+ }
+ sad_array[0] = hadd_uh_u32(sad0);
+ sad_array[1] = hadd_uh_u32(sad1);
+ sad_array[2] = hadd_uh_u32(sad2);
+ sad_array[3] = hadd_uh_u32(sad3);
+}
+/*
+ * SAD of one 64-pixel-wide source block against four reference blocks.
+ *
+ * aref_ptr[0..3] are the four reference pointers, all using ref_stride;
+ * sad_array[0..3] receive the matching sums. One row is handled per
+ * iteration (height iterations). Each reference has two 16-bit-lane
+ * accumulators (sadN_0 for bytes 0..31, sadN_1 for bytes 32..63) that are
+ * widened to 32 bits and added before the final horizontal reduction.
+ */
+static void sad_64width_x4d_lsx(const uint8_t *src, int32_t src_stride,
+ const uint8_t *const aref_ptr[],
+ int32_t ref_stride, int32_t height,
+ uint32_t *sad_array) {
+ const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
+ int32_t ht_cnt = height;
+ __m128i src0, src1, src2, src3;
+ __m128i ref0, ref1, ref2, ref3;
+ __m128i sad, sad_tmp;
+
+ __m128i sad0_0 = __lsx_vldi(0);
+ __m128i sad0_1 = sad0_0;
+ __m128i sad1_0 = sad0_0;
+ __m128i sad1_1 = sad0_0;
+ __m128i sad2_0 = sad0_0;
+ __m128i sad2_1 = sad0_0;
+ __m128i sad3_0 = sad0_0;
+ __m128i sad3_1 = sad0_0;
+
+ ref0_ptr = aref_ptr[0];
+ ref1_ptr = aref_ptr[1];
+ ref2_ptr = aref_ptr[2];
+ ref3_ptr = aref_ptr[3];
+
+ for (; ht_cnt--;) {
+ DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
+ src3);
+ src += src_stride;
+
+ DUP4_ARG2(__lsx_vld, ref0_ptr, 0, ref0_ptr, 16, ref0_ptr, 32, ref0_ptr, 48,
+ ref0, ref1, ref2, ref3);
+ ref0_ptr += ref_stride;
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+ sad0_0 = __lsx_vadd_h(sad0_0, sad_tmp);
+ sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3);
+ sad0_1 = __lsx_vadd_h(sad0_1, sad_tmp);
+
+ DUP4_ARG2(__lsx_vld, ref1_ptr, 0, ref1_ptr, 16, ref1_ptr, 32, ref1_ptr, 48,
+ ref0, ref1, ref2, ref3);
+ ref1_ptr += ref_stride;
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+ sad1_0 = __lsx_vadd_h(sad1_0, sad_tmp);
+ sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3);
+ sad1_1 = __lsx_vadd_h(sad1_1, sad_tmp);
+
+ DUP4_ARG2(__lsx_vld, ref2_ptr, 0, ref2_ptr, 16, ref2_ptr, 32, ref2_ptr, 48,
+ ref0, ref1, ref2, ref3);
+ ref2_ptr += ref_stride;
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+ sad2_0 = __lsx_vadd_h(sad2_0, sad_tmp);
+ sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3);
+ sad2_1 = __lsx_vadd_h(sad2_1, sad_tmp);
+
+ DUP4_ARG2(__lsx_vld, ref3_ptr, 0, ref3_ptr, 16, ref3_ptr, 32, ref3_ptr, 48,
+ ref0, ref1, ref2, ref3);
+ ref3_ptr += ref_stride;
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+ sad3_0 = __lsx_vadd_h(sad3_0, sad_tmp);
+ sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3);
+ sad3_1 = __lsx_vadd_h(sad3_1, sad_tmp);
+ }
+ sad = __lsx_vhaddw_wu_hu(sad0_0, sad0_0);
+ sad_tmp = __lsx_vhaddw_wu_hu(sad0_1, sad0_1);
+ sad = __lsx_vadd_w(sad, sad_tmp);
+ sad_array[0] = hadd_uw_u32(sad);
+
+ sad = __lsx_vhaddw_wu_hu(sad1_0, sad1_0);
+ sad_tmp = __lsx_vhaddw_wu_hu(sad1_1, sad1_1);
+ sad = __lsx_vadd_w(sad, sad_tmp);
+ sad_array[1] = hadd_uw_u32(sad);
+
+ sad = __lsx_vhaddw_wu_hu(sad2_0, sad2_0);
+ sad_tmp = __lsx_vhaddw_wu_hu(sad2_1, sad2_1);
+ sad = __lsx_vadd_w(sad, sad_tmp);
+ sad_array[2] = hadd_uw_u32(sad);
+
+ sad = __lsx_vhaddw_wu_hu(sad3_0, sad3_0);
+ sad_tmp = __lsx_vhaddw_wu_hu(sad3_1, sad3_1);
+ sad = __lsx_vadd_w(sad, sad_tmp);
+ sad_array[3] = hadd_uw_u32(sad);
+}
+/*
+ * SAD between src and the rounded average of ref and sec_pred, for
+ * 32-pixel-wide rows.
+ *
+ * sec_pred is read as a packed buffer: 32 bytes per row, 128 bytes per
+ * four-row iteration, with no stride. Four rows are handled per iteration
+ * (height >> 2 iterations), so height is presumably a multiple of 4.
+ * NOTE(review): res is declared int32_t although it holds the uint32_t
+ * result of hadd_uh_u32() and the function returns uint32_t.
+ * NOTE(review): src_tmp and ref_tmp are obtained by casting away const; they
+ * are only read from here.
+ */
+static uint32_t avgsad_32width_lsx(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ int32_t height, const uint8_t *sec_pred) {
+ int32_t res, ht_cnt = (height >> 2);
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
+ __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+ __m128i comp0, comp1, sad_tmp;
+ __m128i sad = __lsx_vldi(0);
+ uint8_t *src_tmp, *ref_tmp;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+ int32_t ref_stride2 = ref_stride << 1;
+ int32_t ref_stride3 = ref_stride2 + ref_stride;
+ int32_t ref_stride4 = ref_stride2 << 1;
+
+ for (; ht_cnt--;) {
+ src_tmp = (uint8_t *)src + 16;  // right half of each 32-byte row
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4);
+ src6 = __lsx_vldx(src, src_stride3);
+ src1 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src3,
+ src5);
+ src7 = __lsx_vldx(src_tmp, src_stride3);
+ src += src_stride4;
+
+ ref_tmp = (uint8_t *)ref + 16;
+ ref0 = __lsx_vld(ref, 0);
+ DUP2_ARG2(__lsx_vldx, ref, ref_stride, ref, ref_stride2, ref2, ref4);
+ ref6 = __lsx_vldx(ref, ref_stride3);
+ ref1 = __lsx_vld(ref_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, ref_tmp, ref_stride, ref_tmp, ref_stride2, ref3,
+ ref5);
+ ref7 = __lsx_vldx(ref_tmp, ref_stride3);
+ ref += ref_stride4;
+ // Even-numbered pred vectors are the left halves, odd ones the right.
+ DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 32, sec_pred, 64, sec_pred, 96,
+ pred0, pred2, pred4, pred6);
+ DUP4_ARG2(__lsx_vld, sec_pred, 16, sec_pred, 48, sec_pred, 80, sec_pred,
+ 112, pred1, pred3, pred5, pred7);
+ sec_pred += 128;
+ // comp = rounded average of prediction and reference (vavgr_bu).
+ DUP2_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, comp0, comp1);
+ sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1);
+ sad = __lsx_vadd_h(sad, sad_tmp);
+ DUP2_ARG2(__lsx_vavgr_bu, pred2, ref2, pred3, ref3, comp0, comp1);
+ sad_tmp = sad_ub2_uh(src2, src3, comp0, comp1);
+ sad = __lsx_vadd_h(sad, sad_tmp);
+ DUP2_ARG2(__lsx_vavgr_bu, pred4, ref4, pred5, ref5, comp0, comp1);
+ sad_tmp = sad_ub2_uh(src4, src5, comp0, comp1);
+ sad = __lsx_vadd_h(sad, sad_tmp);
+ DUP2_ARG2(__lsx_vavgr_bu, pred6, ref6, pred7, ref7, comp0, comp1);
+ sad_tmp = sad_ub2_uh(src6, src7, comp0, comp1);
+ sad = __lsx_vadd_h(sad, sad_tmp);
+ }
+ res = hadd_uh_u32(sad);
+ return res;
+}
+/*
+ * SAD between src and the rounded average of ref and sec_pred, for
+ * 64-pixel-wide rows.
+ *
+ * sec_pred is read as a packed buffer, 64 bytes per row with no stride.
+ * The loop body is unrolled over four rows (height >> 2 iterations), so
+ * height is presumably a multiple of 4. Bytes 0..31 accumulate into sad0 and
+ * bytes 32..63 into sad1, both as 16-bit lanes.
+ * NOTE(review): the final reduction uses the signed helper hadd_sw_s32() and
+ * an int32_t res although the function returns uint32_t; the sibling
+ * functions use the unsigned helpers.
+ */
+static uint32_t avgsad_64width_lsx(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ int32_t height, const uint8_t *sec_pred) {
+ int32_t res, ht_cnt = (height >> 2);
+ __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3;
+ __m128i comp0, comp1, comp2, comp3, pred0, pred1, pred2, pred3;
+ __m128i sad, sad_tmp;
+ __m128i sad0 = __lsx_vldi(0);
+ __m128i sad1 = sad0;
+
+ for (; ht_cnt--;) {
+ DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
+ src3);
+ src += src_stride;
+ DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2,
+ ref3);
+ ref += ref_stride;
+ DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48,
+ pred0, pred1, pred2, pred3);
+ sec_pred += 64;
+ DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3,
+ ref3, comp0, comp1, comp2, comp3);
+ sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1);
+ sad0 = __lsx_vadd_h(sad0, sad_tmp);
+ sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3);
+ sad1 = __lsx_vadd_h(sad1, sad_tmp);
+ // Row 2 of 4.
+ DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
+ src3);
+ src += src_stride;
+ DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2,
+ ref3);
+ ref += ref_stride;
+ DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48,
+ pred0, pred1, pred2, pred3);
+ sec_pred += 64;
+ DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3,
+ ref3, comp0, comp1, comp2, comp3);
+ sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1);
+ sad0 = __lsx_vadd_h(sad0, sad_tmp);
+ sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3);
+ sad1 = __lsx_vadd_h(sad1, sad_tmp);
+ // Row 3 of 4.
+ DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
+ src3);
+ src += src_stride;
+ DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2,
+ ref3);
+ ref += ref_stride;
+ DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48,
+ pred0, pred1, pred2, pred3);
+ sec_pred += 64;
+ DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3,
+ ref3, comp0, comp1, comp2, comp3);
+ sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1);
+ sad0 = __lsx_vadd_h(sad0, sad_tmp);
+ sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3);
+ sad1 = __lsx_vadd_h(sad1, sad_tmp);
+ // Row 4 of 4.
+ DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
+ src3);
+ src += src_stride;
+ DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2,
+ ref3);
+ ref += ref_stride;
+ DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48,
+ pred0, pred1, pred2, pred3);
+ sec_pred += 64;
+ DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3,
+ ref3, comp0, comp1, comp2, comp3);
+ sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1);
+ sad0 = __lsx_vadd_h(sad0, sad_tmp);
+ sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3);
+ sad1 = __lsx_vadd_h(sad1, sad_tmp);
+ }
+ sad = __lsx_vhaddw_wu_hu(sad0, sad0);
+ sad_tmp = __lsx_vhaddw_wu_hu(sad1, sad1);
+ sad = __lsx_vadd_w(sad, sad_tmp);
+
+ res = hadd_sw_s32(sad);
+ return res;
+}
+/*
+ * Generators for the exported single-reference SAD functions. Each macro
+ * defines vpx_sad<W>x<height>_lsx(), which forwards to the matching
+ * sad_<W>width_lsx() helper with height as a compile-time constant.
+ */
+#define VPX_SAD_8xHT_LSX(height) \
+ uint32_t vpx_sad8x##height##_lsx(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *ref, int32_t ref_stride) { \
+ return sad_8width_lsx(src, src_stride, ref, ref_stride, height); \
+ }
+// 16-pixel-wide variant.
+#define VPX_SAD_16xHT_LSX(height) \
+ uint32_t vpx_sad16x##height##_lsx(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *ref, int32_t ref_stride) { \
+ return sad_16width_lsx(src, src_stride, ref, ref_stride, height); \
+ }
+// 32-pixel-wide variant.
+#define VPX_SAD_32xHT_LSX(height) \
+ uint32_t vpx_sad32x##height##_lsx(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *ref, int32_t ref_stride) { \
+ return sad_32width_lsx(src, src_stride, ref, ref_stride, height); \
+ }
+// 64-pixel-wide variant.
+#define VPX_SAD_64xHT_LSX(height) \
+ uint32_t vpx_sad64x##height##_lsx(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *ref, int32_t ref_stride) { \
+ return sad_64width_lsx(src, src_stride, ref, ref_stride, height); \
+ }
+/*
+ * Generators for the exported four-reference SAD functions. Each macro
+ * defines vpx_sad<W>x<height>x4d_lsx(), which forwards to the matching
+ * sad_<W>width_x4d_lsx() helper. The 8-wide generator spells its parameters
+ * refs[4] / sads[4] while the others use refs[] / *sads; as function
+ * parameters these declare the same pointer types.
+ */
+#define VPX_SAD_8xHTx4D_LSX(height) \
+ void vpx_sad8x##height##x4d_lsx(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *const refs[4], \
+ int32_t ref_stride, uint32_t sads[4]) { \
+ sad_8width_x4d_lsx(src, src_stride, refs, ref_stride, height, sads); \
+ }
+// 16-pixel-wide variant.
+#define VPX_SAD_16xHTx4D_LSX(height) \
+ void vpx_sad16x##height##x4d_lsx(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *const refs[], \
+ int32_t ref_stride, uint32_t *sads) { \
+ sad_16width_x4d_lsx(src, src_stride, refs, ref_stride, height, sads); \
+ }
+// 32-pixel-wide variant.
+#define VPX_SAD_32xHTx4D_LSX(height) \
+ void vpx_sad32x##height##x4d_lsx(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *const refs[], \
+ int32_t ref_stride, uint32_t *sads) { \
+ sad_32width_x4d_lsx(src, src_stride, refs, ref_stride, height, sads); \
+ }
+// 64-pixel-wide variant.
+#define VPX_SAD_64xHTx4D_LSX(height) \
+ void vpx_sad64x##height##x4d_lsx(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *const refs[], \
+ int32_t ref_stride, uint32_t *sads) { \
+ sad_64width_x4d_lsx(src, src_stride, refs, ref_stride, height, sads); \
+ }
+/*
+ * Generators for the exported averaging SAD functions. Each macro defines
+ * vpx_sad<W>x<height>_avg_lsx(), which forwards to avgsad_<W>width_lsx() and
+ * passes second_pred through as the helper's sec_pred argument.
+ */
+#define VPX_AVGSAD_32xHT_LSX(height) \
+ uint32_t vpx_sad32x##height##_avg_lsx( \
+ const uint8_t *src, int32_t src_stride, const uint8_t *ref, \
+ int32_t ref_stride, const uint8_t *second_pred) { \
+ return avgsad_32width_lsx(src, src_stride, ref, ref_stride, height, \
+ second_pred); \
+ }
+// 64-pixel-wide variant.
+#define VPX_AVGSAD_64xHT_LSX(height) \
+ uint32_t vpx_sad64x##height##_avg_lsx( \
+ const uint8_t *src, int32_t src_stride, const uint8_t *ref, \
+ int32_t ref_stride, const uint8_t *second_pred) { \
+ return avgsad_64width_lsx(src, src_stride, ref, ref_stride, height, \
+ second_pred); \
+ }
+// 64-wide: 64x64 SAD, 64x64 and 64x32 four-reference SAD, 64x64 averaging SAD.
+#define SAD64 \
+ VPX_SAD_64xHT_LSX(64) VPX_SAD_64xHTx4D_LSX(64) VPX_SAD_64xHTx4D_LSX(32) \
+ VPX_AVGSAD_64xHT_LSX(64)
+
+SAD64
+// 32-wide: 32x32 SAD, 32x32 and 32x64 four-reference SAD, 32x32 averaging SAD.
+#define SAD32 \
+ VPX_SAD_32xHT_LSX(32) VPX_SAD_32xHTx4D_LSX(32) VPX_SAD_32xHTx4D_LSX(64) \
+ VPX_AVGSAD_32xHT_LSX(32)
+
+SAD32
+// 16-wide: 16x16 SAD and 16x16 four-reference SAD.
+#define SAD16 VPX_SAD_16xHT_LSX(16) VPX_SAD_16xHTx4D_LSX(16)
+
+SAD16
+// 8-wide: 8x8 SAD and 8x8 four-reference SAD.
+#define SAD8 VPX_SAD_8xHT_LSX(8) VPX_SAD_8xHTx4D_LSX(8)
+
+SAD8
+// The grouping macros are only needed for the expansions above.
+#undef SAD64
+#undef SAD32
+#undef SAD16
+#undef SAD8
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/sub_pixel_variance_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/sub_pixel_variance_lsx.c
new file mode 100644
index 0000000000..700793531c
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/sub_pixel_variance_lsx.c
@@ -0,0 +1,874 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/mem.h"
+#include "vpx_dsp/loongarch/variance_lsx.h"
+#include "vpx_dsp/variance.h"
+
+static const uint8_t bilinear_filters_lsx[8][2] = { /* 2-tap weight pairs; the row is selected by x_offset / y_offset and every pair sums to 128 */
+ { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
+ { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 },
+};
+
+#define VARIANCE_WxH(sse, diff, shift) /* sse - ((diff * diff) >> shift), square taken in uint32_t; outer parentheses keep it safe inside larger expressions */ \
+ ((sse) - (((uint32_t)(diff) * (diff)) >> (shift)))
+
+#define VARIANCE_LARGE_WxH(sse, diff, shift) /* sse - ((diff * diff) >> shift), square taken in int64_t; outer parentheses keep it safe inside larger expressions */ \
+ ((sse) - (((int64_t)(diff) * (diff)) >> (shift)))
+
+static uint32_t avg_sse_diff_64x64_lsx(const uint8_t *src_ptr,
+ int32_t src_stride,
+ const uint8_t *ref_ptr,
+ int32_t ref_stride,
+ const uint8_t *sec_pred, int32_t *diff) { /* averages src with sec_pred, compares against ref; returns the reduction of var and stores the reduction of avg0..avg3 in *diff */
+ int32_t res, ht_cnt = 32; /* 32 passes x 2 rows per pass = 64 rows */
+ __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3;
+ __m128i pred0, pred1, pred2, pred3, vec, vec_tmp;
+ __m128i avg0, avg1, avg2, avg3;
+ __m128i var = __lsx_vldi(0);
+
+ avg0 = var; /* one accumulator per 16-byte column, all starting from zero */
+ avg1 = var;
+ avg2 = var;
+ avg3 = var;
+
+ for (; ht_cnt--;) {
+ DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48,
+ pred0, pred1, pred2, pred3);
+ sec_pred += 64; /* sec_pred rows are 64 bytes apart (no separate stride) */
+ DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
+ src0, src1, src2, src3);
+ src_ptr += src_stride;
+ DUP4_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref_ptr, 32, ref_ptr, 48,
+ ref0, ref1, ref2, ref3);
+ ref_ptr += ref_stride;
+
+ DUP4_ARG2(__lsx_vavgr_bu, src0, pred0, src1, pred1, src2, pred2, src3,
+ pred3, src0, src1, src2, src3); /* src = rounding average of src and sec_pred, per unsigned byte */
+ CALC_MSE_AVG_B(src0, ref0, var, avg0); /* macro from variance_lsx.h (not in this chunk); presumably adds squared differences to var and signed differences to avg0 -- confirm */
+ CALC_MSE_AVG_B(src1, ref1, var, avg1);
+ CALC_MSE_AVG_B(src2, ref2, var, avg2);
+ CALC_MSE_AVG_B(src3, ref3, var, avg3);
+
+ DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48,
+ pred0, pred1, pred2, pred3); /* second row of this pass: same steps as above */
+ sec_pred += 64;
+ DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
+ src0, src1, src2, src3);
+ src_ptr += src_stride;
+ DUP4_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref_ptr, 32, ref_ptr, 48,
+ ref0, ref1, ref2, ref3);
+ ref_ptr += ref_stride;
+
+ DUP4_ARG2(__lsx_vavgr_bu, src0, pred0, src1, pred1, src2, pred2, src3,
+ pred3, src0, src1, src2, src3);
+ CALC_MSE_AVG_B(src0, ref0, var, avg0);
+ CALC_MSE_AVG_B(src1, ref1, var, avg1);
+ CALC_MSE_AVG_B(src2, ref2, var, avg2);
+ CALC_MSE_AVG_B(src3, ref3, var, avg3);
+ }
+ vec = __lsx_vhaddw_w_h(avg0, avg0); /* widen the 16-bit lanes to 32-bit pair sums before the four accumulators are combined */
+ vec_tmp = __lsx_vhaddw_w_h(avg1, avg1);
+ vec = __lsx_vadd_w(vec, vec_tmp);
+ vec_tmp = __lsx_vhaddw_w_h(avg2, avg2);
+ vec = __lsx_vadd_w(vec, vec_tmp);
+ vec_tmp = __lsx_vhaddw_w_h(avg3, avg3);
+ vec = __lsx_vadd_w(vec, vec_tmp);
+ HADD_SW_S32(vec, *diff); /* macro defined outside this chunk; presumably a horizontal add of the 32-bit lanes into a scalar -- confirm */
+ HADD_SW_S32(var, res);
+
+ return res;
+}
+
+static uint32_t sub_pixel_sse_diff_8width_h_lsx(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { /* horizontal 2-tap filter on 8-wide src rows, compared against dst; returns the reduction of var and stores the reduction of avg in *diff */
+ uint32_t loop_cnt = (height >> 2); /* four rows per iteration */
+ int32_t res;
+ __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3;
+ __m128i vec0, vec1, vec2, vec3, filt0, out, vec;
+ __m128i mask = { 0x0403030202010100, 0x0807070606050504 }; /* byte index pairs (0,1),(1,2)..(7,8): each pixel next to its right neighbour */
+ __m128i avg = __lsx_vldi(0);
+ __m128i var = avg;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+ int32_t dst_stride2 = dst_stride << 1;
+ int32_t dst_stride3 = dst_stride2 + dst_stride;
+ int32_t dst_stride4 = dst_stride2 << 1;
+
+ filt0 = __lsx_vldrepl_h(filter, 0); /* the two filter bytes, loaded as one halfword and replicated to every lane */
+ for (; loop_cnt--;) {
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+ src3 = __lsx_vldx(src, src_stride3);
+ src += src_stride4;
+ ref0 = __lsx_vld(dst, 0);
+ DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, ref1, ref2);
+ ref3 = __lsx_vldx(dst, dst_stride3);
+ dst += dst_stride4;
+
+ DUP2_ARG2(__lsx_vpickev_d, ref1, ref0, ref3, ref2, ref0, ref1); /* pack the low 8 bytes of two reference rows into one vector */
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); /* arrange neighbouring pixel pairs according to mask */
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+ filt0, vec0, vec1, vec2, vec3); /* per pair: p[x] * w0 + p[x + 1] * w1 as 16-bit sums */
+ DUP4_ARG3(__lsx_vssrarni_bu_h, vec0, vec0, FILTER_BITS, vec1, vec1,
+ FILTER_BITS, vec2, vec2, FILTER_BITS, vec3, vec3, FILTER_BITS,
+ src0, src1, src2, src3); /* rounding shift by FILTER_BITS, saturated and narrowed back to bytes */
+ out = __lsx_vpackev_d(src1, src0); /* two filtered 8-byte rows per vector, matching the packed references */
+ CALC_MSE_AVG_B(out, ref0, var, avg); /* macro from variance_lsx.h (not in this chunk); presumably adds squared differences to var and signed differences to avg -- confirm */
+ out = __lsx_vpackev_d(src3, src2);
+ CALC_MSE_AVG_B(out, ref1, var, avg);
+ }
+ vec = __lsx_vhaddw_w_h(avg, avg); /* widen the 16-bit lanes to 32-bit pair sums before the final reduction */
+ HADD_SW_S32(vec, *diff);
+ HADD_SW_S32(var, res);
+ return res;
+}
+
+static uint32_t sub_pixel_sse_diff_16width_h_lsx(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { /* horizontal 2-tap filter on 16-wide src rows, compared against dst; returns the reduction of var and stores the reduction of avg in *diff */
+ uint32_t loop_cnt = (height >> 2); /* four rows per iteration */
+ int32_t res;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i dst0, dst1, dst2, dst3, filt0;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i out0, out1, out2, out3, out4, out5, out6, out7;
+ __m128i vec, var = __lsx_vldi(0);
+ __m128i avg = var;
+ __m128i mask = { 0x0403030202010100, 0x0807070606050504 }; /* byte index pairs (0,1),(1,2)..(7,8): each pixel next to its right neighbour */
+ int32_t dst_stride2 = dst_stride << 1;
+ int32_t dst_stride3 = dst_stride2 + dst_stride;
+ int32_t dst_stride4 = dst_stride2 << 1;
+
+ filt0 = __lsx_vldrepl_h(filter, 0); /* the two filter bytes, loaded as one halfword and replicated to every lane */
+
+ for (; loop_cnt--;) {
+ DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1); /* overlapping loads at +0 and +8 so each half can reach its ninth byte through mask */
+ src += src_stride;
+ DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3);
+ src += src_stride;
+ DUP2_ARG2(__lsx_vld, src, 0, src, 8, src4, src5);
+ src += src_stride;
+ DUP2_ARG2(__lsx_vld, src, 0, src, 8, src6, src7);
+ src += src_stride;
+
+ dst0 = __lsx_vld(dst, 0);
+ DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2);
+ dst3 = __lsx_vldx(dst, dst_stride3);
+ dst += dst_stride4;
+
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); /* arrange neighbouring pixel pairs according to mask */
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5);
+ DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+ filt0, out0, out1, out2, out3); /* per pair: p[x] * w0 + p[x + 1] * w1 as 16-bit sums */
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
+ filt0, out4, out5, out6, out7);
+ DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2,
+ FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS,
+ src0, src1, src2, src3); /* rounding shift by FILTER_BITS; both halves of a row are narrowed into one 16-byte vector */
+ CALC_MSE_AVG_B(src0, dst0, var, avg); /* macro from variance_lsx.h (not in this chunk); presumably adds squared differences to var and signed differences to avg -- confirm */
+ CALC_MSE_AVG_B(src1, dst1, var, avg);
+ CALC_MSE_AVG_B(src2, dst2, var, avg);
+ CALC_MSE_AVG_B(src3, dst3, var, avg);
+ }
+ vec = __lsx_vhaddw_w_h(avg, avg); /* widen the 16-bit lanes to 32-bit pair sums before the final reduction */
+ HADD_SW_S32(vec, *diff);
+ HADD_SW_S32(var, res);
+ return res;
+}
+
+static uint32_t sub_pixel_sse_diff_32width_h_lsx(
+    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
+  /* A 32-wide block is handled as two adjacent 16-wide columns. The value
+   * returned is the sum of the two column results and *diff receives the
+   * sum of the two per-column difference totals; the 16-wide kernel does
+   * all of the filtering work. */
+  int32_t diff_left, diff_right;
+  const uint32_t sse_left = sub_pixel_sse_diff_16width_h_lsx(
+      src, src_stride, dst, dst_stride, filter, height, &diff_left);
+  const uint32_t sse_right = sub_pixel_sse_diff_16width_h_lsx(
+      src + 16, src_stride, dst + 16, dst_stride, filter, height,
+      &diff_right);
+
+  *diff = diff_left + diff_right;
+
+  return sse_left + sse_right;
+}
+
+static uint32_t sub_pixel_sse_diff_8width_v_lsx(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { /* vertical 2-tap filter on 8-wide src rows, compared against dst; returns the reduction of var and stores the reduction of avg in *diff */
+ uint32_t loop_cnt = (height >> 2); /* four output rows per iteration */
+ int32_t res;
+ __m128i ref0, ref1, ref2, ref3, src0, src1, src2, src3, src4;
+ __m128i vec, vec0, vec1, vec2, vec3, tmp0, tmp1, tmp2, tmp3, filt0;
+ __m128i avg = __lsx_vldi(0);
+ __m128i var = avg;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+ int32_t dst_stride2 = dst_stride << 1;
+ int32_t dst_stride3 = dst_stride2 + dst_stride;
+ int32_t dst_stride4 = dst_stride2 << 1;
+
+ filt0 = __lsx_vldrepl_h(filter, 0); /* the two filter bytes, loaded as one halfword and replicated to every lane */
+ src0 = __lsx_vld(src, 0); /* prime the first row; every output row needs the row above it */
+ src += src_stride;
+
+ for (; loop_cnt--;) {
+ src1 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
+ src4 = __lsx_vldx(src, src_stride3);
+ src += src_stride4;
+ ref0 = __lsx_vld(dst, 0);
+ DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, ref1, ref2);
+ ref3 = __lsx_vldx(dst, dst_stride3);
+ dst += dst_stride4;
+
+ DUP2_ARG2(__lsx_vpickev_d, ref1, ref0, ref3, ref2, ref0, ref1); /* pack the low 8 bytes of two reference rows into one vector */
+ DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3,
+ vec0, vec1, vec2, vec3); /* interleave each row with the row below it so vertical neighbours sit in adjacent bytes */
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+ filt0, tmp0, tmp1, tmp2, tmp3); /* per pair: upper * w0 + lower * w1 as 16-bit sums */
+ DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+ FILTER_BITS, src0, src1); /* rounding shift by FILTER_BITS; two 8-byte rows narrowed into each vector */
+ CALC_MSE_AVG_B(src0, ref0, var, avg); /* macro from variance_lsx.h (not in this chunk); presumably adds squared differences to var and signed differences to avg -- confirm */
+ CALC_MSE_AVG_B(src1, ref1, var, avg);
+
+ src0 = src4; /* the last row loaded is the top row of the next iteration */
+ }
+ vec = __lsx_vhaddw_w_h(avg, avg); /* widen the 16-bit lanes to 32-bit pair sums before the final reduction */
+ HADD_SW_S32(vec, *diff);
+ HADD_SW_S32(var, res);
+ return res;
+}
+
+static uint32_t sub_pixel_sse_diff_16width_v_lsx(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { /* vertical 2-tap filter on 16-wide src rows, compared against dst; returns the reduction of var and stores the reduction of avg in *diff */
+ uint32_t loop_cnt = (height >> 2); /* four output rows per iteration */
+ int32_t res;
+ __m128i ref0, ref1, ref2, ref3, src0, src1, src2, src3, src4;
+ __m128i out0, out1, out2, out3, tmp0, tmp1, filt0, vec;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i var = __lsx_vldi(0);
+ __m128i avg = var;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+ int32_t dst_stride2 = dst_stride << 1;
+ int32_t dst_stride3 = dst_stride2 + dst_stride;
+ int32_t dst_stride4 = dst_stride2 << 1;
+
+ filt0 = __lsx_vldrepl_h(filter, 0); /* the two filter bytes, loaded as one halfword and replicated to every lane */
+
+ src0 = __lsx_vld(src, 0); /* prime the first row; every output row needs the row above it */
+ src += src_stride;
+
+ for (; loop_cnt--;) {
+ src1 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
+ src4 = __lsx_vldx(src, src_stride3);
+ src += src_stride4;
+ ref0 = __lsx_vld(dst, 0);
+ DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, ref1, ref2);
+ ref3 = __lsx_vldx(dst, dst_stride3);
+ dst += dst_stride4;
+
+ DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2); /* low 8 columns: each row interleaved with the row below it */
+ DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3); /* high 8 columns, same pairing */
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); /* per pair: upper * w0 + lower * w1 as 16-bit sums */
+ out0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); /* rounding shift by FILTER_BITS; both halves narrowed into one 16-byte row */
+
+ DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6);
+ DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+ out1 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+ out2 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+ out3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+ src0 = src4; /* the last row loaded is the top row of the next iteration */
+
+ CALC_MSE_AVG_B(out0, ref0, var, avg); /* macro from variance_lsx.h (not in this chunk); presumably adds squared differences to var and signed differences to avg -- confirm */
+ CALC_MSE_AVG_B(out1, ref1, var, avg);
+ CALC_MSE_AVG_B(out2, ref2, var, avg);
+ CALC_MSE_AVG_B(out3, ref3, var, avg);
+ }
+ vec = __lsx_vhaddw_w_h(avg, avg); /* widen the 16-bit lanes to 32-bit pair sums before the final reduction */
+ HADD_SW_S32(vec, *diff);
+ HADD_SW_S32(var, res);
+ return res;
+}
+
+static uint32_t sub_pixel_sse_diff_32width_v_lsx(
+    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
+  /* A 32-wide block is handled as two adjacent 16-wide columns. The value
+   * returned is the sum of the two column results and *diff receives the
+   * sum of the two per-column difference totals; the 16-wide kernel does
+   * all of the filtering work. */
+  int32_t diff_left, diff_right;
+  const uint32_t sse_left = sub_pixel_sse_diff_16width_v_lsx(
+      src, src_stride, dst, dst_stride, filter, height, &diff_left);
+  const uint32_t sse_right = sub_pixel_sse_diff_16width_v_lsx(
+      src + 16, src_stride, dst + 16, dst_stride, filter, height,
+      &diff_right);
+
+  *diff = diff_left + diff_right;
+
+  return sse_left + sse_right;
+}
+
+static uint32_t sub_pixel_sse_diff_8width_hv_lsx(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
+ int32_t height, int32_t *diff) { /* horizontal then vertical 2-tap filter on 8-wide src rows, compared against dst; returns the reduction of var and stores the reduction of avg in *diff */
+ uint32_t loop_cnt = (height >> 2); /* four output rows per iteration */
+ int32_t res;
+ __m128i ref0, ref1, ref2, ref3, src0, src1, src2, src3, src4, out0, out1;
+ __m128i hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3, vec, vec0, filt_hz, filt_vt;
+ __m128i mask = { 0x0403030202010100, 0x0807070606050504 }; /* byte index pairs (0,1),(1,2)..(7,8): each pixel next to its right neighbour */
+ __m128i avg = __lsx_vldi(0);
+ __m128i var = avg;
+
+ filt_hz = __lsx_vldrepl_h(filter_horiz, 0); /* each filter's two bytes, loaded as one halfword and replicated to every lane */
+ filt_vt = __lsx_vldrepl_h(filter_vert, 0);
+
+ src0 = __lsx_vld(src, 0);
+ src += src_stride;
+ HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS, hz_out0); /* macro defined outside this chunk; presumably shuffle by mask, 2-tap dot product and rounding shift -- confirm. Primes the row above the first output row. */
+
+ for (; loop_cnt--;) {
+ DUP2_ARG2(__lsx_vld, src, 0, dst, 0, src1, ref0); /* one source row and one reference row per load pair */
+ src += src_stride;
+ dst += dst_stride;
+ DUP2_ARG2(__lsx_vld, src, 0, dst, 0, src2, ref1);
+ src += src_stride;
+ dst += dst_stride;
+ DUP2_ARG2(__lsx_vld, src, 0, dst, 0, src3, ref2);
+ src += src_stride;
+ dst += dst_stride;
+ DUP2_ARG2(__lsx_vld, src, 0, dst, 0, src4, ref3);
+ src += src_stride;
+ dst += dst_stride;
+
+ DUP2_ARG2(__lsx_vpickev_d, ref1, ref0, ref3, ref2, ref0, ref1); /* pack the low 8 bytes of two reference rows into one vector */
+ HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS, hz_out1);
+ vec0 = __lsx_vpackev_b(hz_out1, hz_out0); /* pair the previous filtered row (hz_out0) with the current one (hz_out1) byte by byte */
+ tmp0 = __lsx_vdp2_h_bu(vec0, filt_vt); /* vertical 2-tap dot product on the paired rows */
+ HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS, hz_out0); /* hz_out0 and hz_out1 swap roles on every row */
+ vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
+ tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+ HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS, hz_out1);
+ vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+ tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt);
+ HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS, hz_out0); /* hz_out0 ends the pass holding the last row, ready for the next iteration */
+ vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
+ tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt);
+ DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+ FILTER_BITS, out0, out1); /* rounding shift by FILTER_BITS; two 8-byte rows narrowed into each vector */
+ CALC_MSE_AVG_B(out0, ref0, var, avg); /* macro from variance_lsx.h (not in this chunk); presumably adds squared differences to var and signed differences to avg -- confirm */
+ CALC_MSE_AVG_B(out1, ref1, var, avg);
+ }
+ vec = __lsx_vhaddw_w_h(avg, avg); /* widen the 16-bit lanes to 32-bit pair sums before the final reduction */
+ HADD_SW_S32(vec, *diff);
+ HADD_SW_S32(var, res);
+ return res;
+}
+
+static uint32_t sub_pixel_sse_diff_16width_hv_lsx(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
+ int32_t height, int32_t *diff) { /* horizontal then vertical 2-tap filter on 16-wide src rows, compared against dst; returns the reduction of var and stores the reduction of avg in *diff */
+ uint32_t loop_cnt = (height >> 2); /* four output rows per iteration */
+ int32_t res;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i ref0, ref1, ref2, ref3, filt_hz, filt_vt, vec0, vec1;
+ __m128i hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1, vec;
+ __m128i var = __lsx_vldi(0);
+ __m128i avg = var;
+ __m128i mask = { 0x0403030202010100, 0x0807070606050504 }; /* byte index pairs (0,1),(1,2)..(7,8): each pixel next to its right neighbour */
+ int32_t dst_stride2 = dst_stride << 1;
+ int32_t dst_stride3 = dst_stride2 + dst_stride;
+ int32_t dst_stride4 = dst_stride2 << 1;
+
+ filt_hz = __lsx_vldrepl_h(filter_horiz, 0); /* each filter's two bytes, loaded as one halfword and replicated to every lane */
+ filt_vt = __lsx_vldrepl_h(filter_vert, 0);
+
+ DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1); /* overlapping loads at +0 and +8 so each half can reach its ninth byte through mask */
+ src += src_stride;
+
+ HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS, hz_out0); /* macro defined outside this chunk; presumably shuffle by mask, 2-tap dot product and rounding shift -- confirm. hz_out0 / hz_out2 prime the left / right half of the row above the first output row. */
+ HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS, hz_out2);
+
+ for (; loop_cnt--;) {
+ DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
+ src += src_stride;
+ DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3);
+ src += src_stride;
+ DUP2_ARG2(__lsx_vld, src, 0, src, 8, src4, src5);
+ src += src_stride;
+ DUP2_ARG2(__lsx_vld, src, 0, src, 8, src6, src7);
+ src += src_stride;
+
+ ref0 = __lsx_vld(dst, 0);
+ DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, ref1, ref2);
+ ref3 = __lsx_vldx(dst, dst_stride3);
+ dst += dst_stride4;
+
+ HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS, hz_out1);
+ HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS, hz_out3);
+ DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); /* pair the previous filtered row with the current one, per half */
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); /* vertical 2-tap dot product on the paired rows */
+ src0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); /* rounding shift by FILTER_BITS; both halves narrowed into one 16-byte row (src0..src3 are reused as outputs) */
+
+ HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS, hz_out0); /* hz_out0/2 and hz_out1/3 swap roles on every row */
+ HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS, hz_out2);
+ DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+ src1 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+ HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS, hz_out1);
+ HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS, hz_out3);
+ DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+ src2 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+ HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS, hz_out0); /* hz_out0 / hz_out2 end the pass holding the last row, ready for the next iteration */
+ HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS, hz_out2);
+ DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+ src3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+ CALC_MSE_AVG_B(src0, ref0, var, avg); /* macro from variance_lsx.h (not in this chunk); presumably adds squared differences to var and signed differences to avg -- confirm */
+ CALC_MSE_AVG_B(src1, ref1, var, avg);
+ CALC_MSE_AVG_B(src2, ref2, var, avg);
+ CALC_MSE_AVG_B(src3, ref3, var, avg);
+ }
+ vec = __lsx_vhaddw_w_h(avg, avg); /* widen the 16-bit lanes to 32-bit pair sums before the final reduction */
+ HADD_SW_S32(vec, *diff);
+ HADD_SW_S32(var, res);
+
+ return res;
+}
+
+static uint32_t sub_pixel_sse_diff_32width_hv_lsx(
+    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
+    int32_t height, int32_t *diff) {
+  /* A 32-wide block is handled as two adjacent 16-wide columns. The value
+   * returned is the sum of the two column results and *diff receives the
+   * sum of the two per-column difference totals; the 16-wide kernel does
+   * all of the filtering work. */
+  int32_t diff_left, diff_right;
+
+  const uint32_t sse_left = sub_pixel_sse_diff_16width_hv_lsx(
+      src, src_stride, dst, dst_stride, filter_horiz, filter_vert, height,
+      &diff_left);
+  const uint32_t sse_right = sub_pixel_sse_diff_16width_hv_lsx(
+      src + 16, src_stride, dst + 16, dst_stride, filter_horiz, filter_vert,
+      height, &diff_right);
+
+  *diff = diff_left + diff_right;
+
+  return sse_left + sse_right;
+}
+
+static uint32_t subpel_avg_ssediff_16w_h_lsx(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+ int32_t height, int32_t *diff, int32_t width) { /* horizontal 2-tap filter on a 16-wide column, averaged with sec_pred and compared against dst; width is the byte distance between sec_pred rows; returns the reduction of var and stores the reduction of avg in *diff */
+ uint32_t loop_cnt = (height >> 2); /* four rows per iteration */
+ int32_t res;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
+ __m128i pred0, pred1, pred2, pred3, filt0, vec;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i out0, out1, out2, out3, out4, out5, out6, out7;
+ __m128i mask = { 0x403030202010100, 0x807070606050504 }; /* byte index pairs (0,1),(1,2)..(7,8): each pixel next to its right neighbour */
+ __m128i avg = __lsx_vldi(0);
+ __m128i var = avg;
+
+ filt0 = __lsx_vldrepl_h(filter, 0); /* the two filter bytes, loaded as one halfword and replicated to every lane */
+
+ for (; loop_cnt--;) {
+ DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1); /* overlapping loads at +0 and +8 so each half can reach its ninth byte through mask */
+ src += src_stride;
+ DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3);
+ src += src_stride;
+ DUP2_ARG2(__lsx_vld, src, 0, src, 8, src4, src5);
+ src += src_stride;
+ DUP2_ARG2(__lsx_vld, src, 0, src, 8, src6, src7);
+ src += src_stride;
+
+ dst0 = __lsx_vld(dst, 0);
+ dst += dst_stride;
+ dst1 = __lsx_vld(dst, 0);
+ dst += dst_stride;
+ dst2 = __lsx_vld(dst, 0);
+ dst += dst_stride;
+ dst3 = __lsx_vld(dst, 0);
+ dst += dst_stride;
+
+ pred0 = __lsx_vld(sec_pred, 0);
+ sec_pred += width; /* sec_pred advances by the caller-supplied width, not by a stride argument */
+ pred1 = __lsx_vld(sec_pred, 0);
+ sec_pred += width;
+ pred2 = __lsx_vld(sec_pred, 0);
+ sec_pred += width;
+ pred3 = __lsx_vld(sec_pred, 0);
+ sec_pred += width;
+
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); /* arrange neighbouring pixel pairs according to mask */
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5);
+ DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+ filt0, out0, out1, out2, out3); /* per pair: p[x] * w0 + p[x + 1] * w1 as 16-bit sums */
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
+ filt0, out4, out5, out6, out7);
+ DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2,
+ FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS,
+ tmp0, tmp1, tmp2, tmp3); /* rounding shift by FILTER_BITS; both halves of a row narrowed into one 16-byte vector */
+ DUP4_ARG2(__lsx_vavgr_bu, tmp0, pred0, tmp1, pred1, tmp2, pred2, tmp3,
+ pred3, tmp0, tmp1, tmp2, tmp3); /* rounding average of the filtered row with sec_pred, per unsigned byte */
+
+ CALC_MSE_AVG_B(tmp0, dst0, var, avg); /* macro from variance_lsx.h (not in this chunk); presumably adds squared differences to var and signed differences to avg -- confirm */
+ CALC_MSE_AVG_B(tmp1, dst1, var, avg);
+ CALC_MSE_AVG_B(tmp2, dst2, var, avg);
+ CALC_MSE_AVG_B(tmp3, dst3, var, avg);
+ }
+ vec = __lsx_vhaddw_w_h(avg, avg); /* widen the 16-bit lanes to 32-bit pair sums before the final reduction */
+ HADD_SW_S32(vec, *diff);
+ HADD_SW_S32(var, res);
+
+ return res;
+}
+
+static uint32_t subpel_avg_ssediff_16w_v_lsx(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+ int32_t height, int32_t *diff, int32_t width) { /* vertical 2-tap filter on a 16-wide column, averaged with sec_pred and compared against dst; width is the byte distance between sec_pred rows; returns the reduction of var and stores the reduction of avg in *diff */
+ uint32_t loop_cnt = (height >> 2); /* four output rows per iteration */
+ int32_t res;
+ __m128i ref0, ref1, ref2, ref3, pred0, pred1, pred2, pred3;
+ __m128i src0, src1, src2, src3, src4, out0, out1, out2, out3;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i tmp0, tmp1, vec, filt0;
+ __m128i avg = __lsx_vldi(0);
+ __m128i var = avg;
+
+ filt0 = __lsx_vldrepl_h(filter, 0); /* the two filter bytes, loaded as one halfword and replicated to every lane */
+
+ src0 = __lsx_vld(src, 0); /* prime the first row; every output row needs the row above it */
+ src += src_stride;
+
+ for (; loop_cnt--;) {
+ src1 = __lsx_vld(src, 0);
+ src += src_stride;
+ src2 = __lsx_vld(src, 0);
+ src += src_stride;
+ src3 = __lsx_vld(src, 0);
+ src += src_stride;
+ src4 = __lsx_vld(src, 0);
+ src += src_stride;
+
+ pred0 = __lsx_vld(sec_pred, 0);
+ sec_pred += width; /* sec_pred advances by the caller-supplied width, not by a stride argument */
+ pred1 = __lsx_vld(sec_pred, 0);
+ sec_pred += width;
+ pred2 = __lsx_vld(sec_pred, 0);
+ sec_pred += width;
+ pred3 = __lsx_vld(sec_pred, 0);
+ sec_pred += width;
+
+ DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2); /* low 8 columns: each row interleaved with the row below it */
+ DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3); /* high 8 columns, same pairing */
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); /* per pair: upper * w0 + lower * w1 as 16-bit sums */
+ out0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); /* rounding shift by FILTER_BITS; both halves narrowed into one 16-byte row */
+
+ DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6);
+ DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+ out1 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+ out2 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+ out3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+ src0 = src4; /* the last row loaded is the top row of the next iteration */
+ ref0 = __lsx_vld(dst, 0);
+ dst += dst_stride;
+ ref1 = __lsx_vld(dst, 0);
+ dst += dst_stride;
+ ref2 = __lsx_vld(dst, 0);
+ dst += dst_stride;
+ ref3 = __lsx_vld(dst, 0);
+ dst += dst_stride;
+
+ DUP4_ARG2(__lsx_vavgr_bu, out0, pred0, out1, pred1, out2, pred2, out3,
+ pred3, out0, out1, out2, out3); /* rounding average of the filtered row with sec_pred, per unsigned byte */
+
+ CALC_MSE_AVG_B(out0, ref0, var, avg); /* macro from variance_lsx.h (not in this chunk); presumably adds squared differences to var and signed differences to avg -- confirm */
+ CALC_MSE_AVG_B(out1, ref1, var, avg);
+ CALC_MSE_AVG_B(out2, ref2, var, avg);
+ CALC_MSE_AVG_B(out3, ref3, var, avg);
+ }
+ vec = __lsx_vhaddw_w_h(avg, avg); /* widen the 16-bit lanes to 32-bit pair sums before the final reduction */
+ HADD_SW_S32(vec, *diff);
+ HADD_SW_S32(var, res);
+ return res;
+}
+
+static uint32_t subpel_avg_ssediff_16w_hv_lsx(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
+ const uint8_t *filter_vert, int32_t height, int32_t *diff, int32_t width) { /* horizontal then vertical 2-tap filter on a 16-wide column, averaged with sec_pred and compared against dst; width is the byte distance between sec_pred rows; returns the reduction of var and stores the reduction of avg in *diff */
+ uint32_t loop_cnt = (height >> 2); /* four output rows per iteration */
+ int32_t res;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i ref0, ref1, ref2, ref3, pred0, pred1, pred2, pred3;
+ __m128i hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
+ __m128i out0, out1, out2, out3, filt_hz, filt_vt, vec, vec0, vec1;
+ __m128i mask = { 0x403030202010100, 0x807070606050504 }; /* byte index pairs (0,1),(1,2)..(7,8): each pixel next to its right neighbour */
+ __m128i avg = __lsx_vldi(0);
+ __m128i var = avg;
+
+ filt_hz = __lsx_vldrepl_h(filter_horiz, 0); /* each filter's two bytes, loaded as one halfword and replicated to every lane */
+ filt_vt = __lsx_vldrepl_h(filter_vert, 0);
+
+ DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1); /* overlapping loads at +0 and +8 so each half can reach its ninth byte through mask */
+ src += src_stride;
+
+ HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS, hz_out0); /* macro defined outside this chunk; presumably shuffle by mask, 2-tap dot product and rounding shift -- confirm. hz_out0 / hz_out2 prime the left / right half of the row above the first output row. */
+ HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS, hz_out2);
+
+ for (; loop_cnt--;) {
+ DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
+ src += src_stride;
+ DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3);
+ src += src_stride;
+ DUP2_ARG2(__lsx_vld, src, 0, src, 8, src4, src5);
+ src += src_stride;
+ DUP2_ARG2(__lsx_vld, src, 0, src, 8, src6, src7);
+ src += src_stride;
+
+ pred0 = __lsx_vld(sec_pred, 0);
+ sec_pred += width; /* sec_pred advances by the caller-supplied width, not by a stride argument */
+ pred1 = __lsx_vld(sec_pred, 0);
+ sec_pred += width;
+ pred2 = __lsx_vld(sec_pred, 0);
+ sec_pred += width;
+ pred3 = __lsx_vld(sec_pred, 0);
+ sec_pred += width;
+
+ HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS, hz_out1);
+ HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS, hz_out3);
+ DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); /* pair the previous filtered row with the current one, per half */
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); /* vertical 2-tap dot product on the paired rows */
+ out0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); /* rounding shift by FILTER_BITS; both halves narrowed into one 16-byte row */
+
+ HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS, hz_out0); /* hz_out0/2 and hz_out1/3 swap roles on every row */
+ HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS, hz_out2);
+ DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+ out1 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+ HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS, hz_out1);
+ HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS, hz_out3);
+ DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+ out2 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+ HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS, hz_out0); /* hz_out0 / hz_out2 end the pass holding the last row, ready for the next iteration */
+ HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS, hz_out2);
+ DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+ out3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+ ref0 = __lsx_vld(dst, 0);
+ dst += dst_stride;
+ ref1 = __lsx_vld(dst, 0);
+ dst += dst_stride;
+ ref2 = __lsx_vld(dst, 0);
+ dst += dst_stride;
+ ref3 = __lsx_vld(dst, 0);
+ dst += dst_stride;
+
+ DUP4_ARG2(__lsx_vavgr_bu, out0, pred0, out1, pred1, out2, pred2, out3,
+ pred3, out0, out1, out2, out3); /* rounding average of the filtered row with sec_pred, per unsigned byte */
+
+ CALC_MSE_AVG_B(out0, ref0, var, avg); /* macro from variance_lsx.h (not in this chunk); presumably adds squared differences to var and signed differences to avg -- confirm */
+ CALC_MSE_AVG_B(out1, ref1, var, avg);
+ CALC_MSE_AVG_B(out2, ref2, var, avg);
+ CALC_MSE_AVG_B(out3, ref3, var, avg);
+ }
+ vec = __lsx_vhaddw_w_h(avg, avg); /* widen the 16-bit lanes to 32-bit pair sums before the final reduction */
+ HADD_SW_S32(vec, *diff);
+ HADD_SW_S32(var, res);
+ return res;
+}
+
+static uint32_t sub_pixel_avg_sse_diff_64width_h_lsx(
+    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+    int32_t height, int32_t *diff) {
+  /* Four 16-wide columns; 64 is the row pitch handed on for sec_pred. */
+  uint32_t total_sse = 0;
+  int32_t total_diff = 0;
+  int32_t col;
+
+  for (col = 0; col < 64; col += 16) {
+    int32_t col_diff;
+    total_sse += subpel_avg_ssediff_16w_h_lsx(
+        src + col, src_stride, dst + col, dst_stride, sec_pred + col, filter,
+        height, &col_diff, 64);
+    total_diff += col_diff;
+  }
+
+  *diff = total_diff;
+  return total_sse;
+}
+
+static uint32_t sub_pixel_avg_sse_diff_64width_v_lsx(
+    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+    int32_t height, int32_t *diff) {
+  /* Four 16-wide columns; 64 is the row pitch handed on for sec_pred. */
+  uint32_t total_sse = 0;
+  int32_t total_diff = 0;
+  int32_t col;
+
+  for (col = 0; col < 64; col += 16) {
+    int32_t col_diff;
+    total_sse += subpel_avg_ssediff_16w_v_lsx(
+        src + col, src_stride, dst + col, dst_stride, sec_pred + col, filter,
+        height, &col_diff, 64);
+    total_diff += col_diff;
+  }
+
+  *diff = total_diff;
+  return total_sse;
+}
+
+static uint32_t sub_pixel_avg_sse_diff_64width_hv_lsx(
+    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
+    const uint8_t *filter_vert, int32_t height, int32_t *diff) {
+  /* Four 16-wide columns; 64 is the row pitch handed on for sec_pred. */
+  uint32_t total_sse = 0;
+  int32_t total_diff = 0;
+  int32_t col;
+
+  for (col = 0; col < 64; col += 16) {
+    int32_t col_diff;
+    total_sse += subpel_avg_ssediff_16w_hv_lsx(
+        src + col, src_stride, dst + col, dst_stride, sec_pred + col,
+        filter_horiz, filter_vert, height, &col_diff, 64);
+    total_diff += col_diff;
+  }
+
+  *diff = total_diff;
+  return total_sse;
+}
+
+#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6) /* shift 6: 8 * 8 = 64 = 2^6 */
+#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8) /* shift 8: 16 * 16 = 256 = 2^8 */
+#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10) /* shift 10: 32 * 32 = 2^10; uses the 64-bit square */
+#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12) /* shift 12: 64 * 64 = 2^12; uses the 64-bit square */
+
+#define VPX_SUB_PIXEL_VARIANCE_WDXHT_LSX(wd, ht) /* emits vpx_sub_pixel_variance<wd>x<ht>_lsx(): stores the kernel result in *sse and returns the variance */ \
+ uint32_t vpx_sub_pixel_variance##wd##x##ht##_lsx( \
+ const uint8_t *src, int32_t src_stride, int32_t x_offset, \
+ int32_t y_offset, const uint8_t *ref, int32_t ref_stride, \
+ uint32_t *sse) { \
+ int32_t diff; \
+ uint32_t var; \
+ const uint8_t *h_filter = bilinear_filters_lsx[x_offset]; /* offsets index the 8-row table directly; no range check is done here */ \
+ const uint8_t *v_filter = bilinear_filters_lsx[y_offset]; \
+ \
+ if (y_offset) { /* vertical filtering is needed */ \
+ if (x_offset) { /* ... and horizontal: two-pass kernel */ \
+ *sse = sub_pixel_sse_diff_##wd##width_hv_lsx( \
+ src, src_stride, ref, ref_stride, h_filter, v_filter, ht, &diff); \
+ } else { \
+ *sse = sub_pixel_sse_diff_##wd##width_v_lsx( \
+ src, src_stride, ref, ref_stride, v_filter, ht, &diff); \
+ } \
+ \
+ var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \
+ } else { \
+ if (x_offset) { /* horizontal filtering only */ \
+ *sse = sub_pixel_sse_diff_##wd##width_h_lsx( \
+ src, src_stride, ref, ref_stride, h_filter, ht, &diff); \
+ \
+ var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \
+ } else { /* both offsets are zero: use the unfiltered variance function */ \
+ var = vpx_variance##wd##x##ht##_lsx(src, src_stride, ref, ref_stride, \
+ sse); \
+ } \
+ } \
+ \
+ return var; \
+ }
+
+VPX_SUB_PIXEL_VARIANCE_WDXHT_LSX(8, 8) /* only the square 8x8, 16x16 and 32x32 sizes are expanded here */
+VPX_SUB_PIXEL_VARIANCE_WDXHT_LSX(16, 16)
+VPX_SUB_PIXEL_VARIANCE_WDXHT_LSX(32, 32)
+
+#define VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_LSX(ht) /* emits vpx_sub_pixel_avg_variance64x<ht>_lsx(): stores the kernel result in *sse and returns the variance */ \
+ uint32_t vpx_sub_pixel_avg_variance64x##ht##_lsx( \
+ const uint8_t *src_ptr, int32_t src_stride, int32_t x_offset, \
+ int32_t y_offset, const uint8_t *ref_ptr, int32_t ref_stride, \
+ uint32_t *sse, const uint8_t *sec_pred) { \
+ int32_t diff; \
+ const uint8_t *h_filter = bilinear_filters_lsx[x_offset]; /* offsets index the 8-row table directly; no range check is done here */ \
+ const uint8_t *v_filter = bilinear_filters_lsx[y_offset]; \
+ \
+ if (y_offset) { /* vertical filtering is needed */ \
+ if (x_offset) { /* ... and horizontal: two-pass kernel */ \
+ *sse = sub_pixel_avg_sse_diff_64width_hv_lsx( \
+ src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, \
+ v_filter, ht, &diff); \
+ } else { \
+ *sse = sub_pixel_avg_sse_diff_64width_v_lsx( \
+ src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \
+ &diff); \
+ } \
+ } else { \
+ if (x_offset) { /* horizontal filtering only */ \
+ *sse = sub_pixel_avg_sse_diff_64width_h_lsx( \
+ src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \
+ &diff); \
+ } else { /* both offsets are zero: average with sec_pred only, no filtering */ \
+ *sse = avg_sse_diff_64x##ht##_lsx(src_ptr, src_stride, ref_ptr, \
+ ref_stride, sec_pred, &diff); \
+ } \
+ } \
+ \
+ return VARIANCE_64Wx##ht##H(*sse, diff); /* every path shares the same variance formula */ \
+ }
+
+VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_LSX(64) /* only the 64x64 size is expanded here */
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/subtract_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/subtract_lsx.c
new file mode 100644
index 0000000000..943a5c5a9b
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/subtract_lsx.c
@@ -0,0 +1,371 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_util/loongson_intrinsics.h"
+/*
+ * Computes the 4x4 residual block and stores it as int16_t values, one
+ * 4-element row per diff_stride step.
+ *
+ * Each of the four src/pred rows is loaded as a replicated 32-bit word; the
+ * rows are packed into one vector per input, byte-interleaved with the other
+ * input, and reduced pairwise by __lsx_vhsubw_hu_bu into 16-bit lanes. Each
+ * 64-bit half of diff0/diff1 is then written out as one row.
+ *
+ * NOTE(review): the sign of the result (presumably src - pred) follows from
+ * the LSX operand order of the interleave / horizontal-subtract pair --
+ * confirm against the intrinsic reference.
+ */
+static void sub_blk_4x4_lsx(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *pred_ptr, int32_t pred_stride,
+ int16_t *diff_ptr, int32_t diff_stride) {
+ __m128i src0, src1, src2, src3;
+ __m128i pred0, pred1, pred2, pred3;
+ __m128i diff0, diff1;
+ __m128i reg0, reg1;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t pred_stride2 = pred_stride << 1;
+ int32_t diff_stride2 = diff_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t pred_stride3 = pred_stride2 + pred_stride;
+ int32_t diff_stride3 = diff_stride2 + diff_stride;
+
+ DUP4_ARG2(__lsx_vldrepl_w, src_ptr, 0, src_ptr + src_stride, 0,
+ src_ptr + src_stride2, 0, src_ptr + src_stride3, 0, src0, src1,
+ src2, src3); /* rows 0..3 of src, 4 bytes each */
+ DUP4_ARG2(__lsx_vldrepl_w, pred_ptr, 0, pred_ptr + pred_stride, 0,
+ pred_ptr + pred_stride2, 0, pred_ptr + pred_stride3, 0, pred0,
+ pred1, pred2, pred3); /* rows 0..3 of pred, 4 bytes each */
+ DUP4_ARG2(__lsx_vilvl_w, src1, src0, src3, src2, pred1, pred0, pred3, pred2,
+ src0, src2, pred0, pred2); /* pair up rows 0/1 and rows 2/3 */
+ DUP2_ARG2(__lsx_vilvl_d, src2, src0, pred2, pred0, src0, pred0); /* all 16 bytes in one vector */
+ reg0 = __lsx_vilvl_b(src0, pred0);
+ reg1 = __lsx_vilvh_b(src0, pred0);
+ DUP2_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, diff0, diff1);
+ __lsx_vstelm_d(diff0, diff_ptr, 0, 0);
+ __lsx_vstelm_d(diff0, diff_ptr + diff_stride, 0, 1);
+ __lsx_vstelm_d(diff1, diff_ptr + diff_stride2, 0, 0);
+ __lsx_vstelm_d(diff1, diff_ptr + diff_stride3, 0, 1);
+}
+/*
+ * Computes the 8x8 residual block and stores it as int16_t values.
+ *
+ * Rows are loaded four at a time as replicated 64-bit words, the src and
+ * pred rows are byte-interleaved, and __lsx_vhsubw_hu_bu reduces each byte
+ * pair into one 16-bit lane, so every result vector holds one 8-element
+ * output row. The src0..src7 registers are reused to hold the results.
+ *
+ * dst_stride is 2 * diff_stride and is passed as the offset argument of
+ * __lsx_vstx (presumably a byte offset, i.e. one int16_t row -- confirm
+ * against the intrinsic reference).
+ */
+static void sub_blk_8x8_lsx(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *pred_ptr, int32_t pred_stride,
+ int16_t *diff_ptr, int32_t diff_stride) {
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+ __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t pred_stride2 = pred_stride << 1;
+ int32_t dst_stride = diff_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t pred_stride3 = pred_stride2 + pred_stride;
+ int32_t dst_stride2 = dst_stride << 1;
+ int32_t src_stride4 = src_stride2 << 1;
+ int32_t pred_stride4 = pred_stride2 << 1;
+ int32_t dst_stride3 = dst_stride + dst_stride2;
+
+ DUP4_ARG2(__lsx_vldrepl_d, src_ptr, 0, src_ptr + src_stride, 0,
+ src_ptr + src_stride2, 0, src_ptr + src_stride3, 0, src0, src1,
+ src2, src3);
+ DUP4_ARG2(__lsx_vldrepl_d, pred_ptr, 0, pred_ptr + pred_stride, 0,
+ pred_ptr + pred_stride2, 0, pred_ptr + pred_stride3, 0, pred0,
+ pred1, pred2, pred3);
+ src_ptr += src_stride4; /* advance to row 4 */
+ pred_ptr += pred_stride4;
+
+ DUP4_ARG2(__lsx_vldrepl_d, src_ptr, 0, src_ptr + src_stride, 0,
+ src_ptr + src_stride2, 0, src_ptr + src_stride3, 0, src4, src5,
+ src6, src7);
+ DUP4_ARG2(__lsx_vldrepl_d, pred_ptr, 0, pred_ptr + pred_stride, 0,
+ pred_ptr + pred_stride2, 0, pred_ptr + pred_stride3, 0, pred4,
+ pred5, pred6, pred7);
+
+ DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+ reg0, reg1, reg2, reg3);
+ DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
+ reg4, reg5, reg6, reg7);
+ DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3, reg3,
+ src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7, reg7,
+ src4, src5, src6, src7);
+ __lsx_vst(src0, diff_ptr, 0);
+ __lsx_vstx(src1, diff_ptr, dst_stride);
+ __lsx_vstx(src2, diff_ptr, dst_stride2);
+ __lsx_vstx(src3, diff_ptr, dst_stride3);
+ diff_ptr += dst_stride2; /* int16_t pointer: 4 * diff_stride elements = 4 rows */
+ __lsx_vst(src4, diff_ptr, 0);
+ __lsx_vstx(src5, diff_ptr, dst_stride);
+ __lsx_vstx(src6, diff_ptr, dst_stride2);
+ __lsx_vstx(src7, diff_ptr, dst_stride3);
+}
+/*
+ * Computes the 16x16 residual block and stores it as int16_t values.
+ *
+ * The block is handled in two identical passes of eight rows. In each pass
+ * one 16-byte vector is loaded per src/pred row, the low and high halves are
+ * byte-interleaved separately (__lsx_vilvl_b / __lsx_vilvh_b) and reduced by
+ * __lsx_vhsubw_hu_bu, giving two 8-lane result vectors per row. The low
+ * half is stored through diff and the high half through diff_tmp (diff + 8).
+ * The src0..src7 and pred0..pred7 registers are reused to hold the results
+ * of rows 0..3 and 4..7 of the pass respectively.
+ *
+ * dst_stride is 2 * diff_stride and is passed as the offset argument of
+ * __lsx_vstx (presumably a byte offset -- confirm against the intrinsic
+ * reference).
+ */
+static void sub_blk_16x16_lsx(const uint8_t *src, int32_t src_stride,
+ const uint8_t *pred, int32_t pred_stride,
+ int16_t *diff, int32_t diff_stride) {
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+ __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t pred_stride2 = pred_stride << 1;
+ int32_t dst_stride = diff_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t pred_stride3 = pred_stride2 + pred_stride;
+ int32_t dst_stride2 = dst_stride << 1;
+ int32_t src_stride4 = src_stride2 << 1;
+ int32_t pred_stride4 = pred_stride2 << 1;
+ int32_t dst_stride3 = dst_stride + dst_stride2;
+ int16_t *diff_tmp = diff + 8; /* columns 8..15 of each output row */
+
+ DUP2_ARG2(__lsx_vld, src, 0, pred, 0, src0, pred0);
+ DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+ src, src_stride4, src1, src2, src3, src4); /* rows 1..4 */
+ DUP4_ARG2(__lsx_vldx, pred, pred_stride, pred, pred_stride2, pred,
+ pred_stride3, pred, pred_stride4, pred1, pred2, pred3, pred4);
+ src += src_stride4; /* now at row 4; rows 5..7 follow */
+ pred += pred_stride4;
+ DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+ pred, pred_stride, src5, src6, src7, pred5);
+ DUP2_ARG2(__lsx_vldx, pred, pred_stride2, pred, pred_stride3, pred6, pred7);
+ src += src_stride4; /* now at row 8 for the second pass */
+ pred += pred_stride4;
+ DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+ reg0, reg2, reg4, reg6);
+ DUP4_ARG2(__lsx_vilvh_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+ reg1, reg3, reg5, reg7);
+ DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
+ tmp0, tmp2, tmp4, tmp6);
+ DUP4_ARG2(__lsx_vilvh_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
+ tmp1, tmp3, tmp5, tmp7);
+ DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3, reg3,
+ src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7, reg7,
+ src4, src5, src6, src7);
+ DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3,
+ pred0, pred1, pred2, pred3);
+ DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp4, tmp4, tmp5, tmp5, tmp6, tmp6, tmp7, tmp7,
+ pred4, pred5, pred6, pred7);
+ __lsx_vst(src0, diff, 0); /* even-numbered results: left halves of rows 0..3 */
+ __lsx_vstx(src2, diff, dst_stride);
+ __lsx_vstx(src4, diff, dst_stride2);
+ __lsx_vstx(src6, diff, dst_stride3);
+ __lsx_vst(src1, diff_tmp, 0); /* odd-numbered results: right halves of rows 0..3 */
+ __lsx_vstx(src3, diff_tmp, dst_stride);
+ __lsx_vstx(src5, diff_tmp, dst_stride2);
+ __lsx_vstx(src7, diff_tmp, dst_stride3);
+ diff += dst_stride2; /* int16_t pointer: 4 * diff_stride elements = 4 rows */
+ diff_tmp += dst_stride2;
+ __lsx_vst(pred0, diff, 0);
+ __lsx_vstx(pred2, diff, dst_stride);
+ __lsx_vstx(pred4, diff, dst_stride2);
+ __lsx_vstx(pred6, diff, dst_stride3);
+ __lsx_vst(pred1, diff_tmp, 0);
+ __lsx_vstx(pred3, diff_tmp, dst_stride);
+ __lsx_vstx(pred5, diff_tmp, dst_stride2);
+ __lsx_vstx(pred7, diff_tmp, dst_stride3);
+ diff += dst_stride2;
+ diff_tmp += dst_stride2;
+ DUP2_ARG2(__lsx_vld, src, 0, pred, 0, src0, pred0); /* second pass: rows 8..15 */
+ DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+ src, src_stride4, src1, src2, src3, src4);
+ DUP4_ARG2(__lsx_vldx, pred, pred_stride, pred, pred_stride2, pred,
+ pred_stride3, pred, pred_stride4, pred1, pred2, pred3, pred4);
+ src += src_stride4;
+ pred += pred_stride4;
+ DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+ pred, pred_stride, src5, src6, src7, pred5);
+ DUP2_ARG2(__lsx_vldx, pred, pred_stride2, pred, pred_stride3, pred6, pred7);
+ DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+ reg0, reg2, reg4, reg6);
+ DUP4_ARG2(__lsx_vilvh_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+ reg1, reg3, reg5, reg7);
+ DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
+ tmp0, tmp2, tmp4, tmp6);
+ DUP4_ARG2(__lsx_vilvh_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
+ tmp1, tmp3, tmp5, tmp7);
+ DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3, reg3,
+ src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7, reg7,
+ src4, src5, src6, src7);
+ DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3,
+ pred0, pred1, pred2, pred3);
+ DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp4, tmp4, tmp5, tmp5, tmp6, tmp6, tmp7, tmp7,
+ pred4, pred5, pred6, pred7);
+ __lsx_vst(src0, diff, 0);
+ __lsx_vstx(src2, diff, dst_stride);
+ __lsx_vstx(src4, diff, dst_stride2);
+ __lsx_vstx(src6, diff, dst_stride3);
+ __lsx_vst(src1, diff_tmp, 0);
+ __lsx_vstx(src3, diff_tmp, dst_stride);
+ __lsx_vstx(src5, diff_tmp, dst_stride2);
+ __lsx_vstx(src7, diff_tmp, dst_stride3);
+ diff += dst_stride2;
+ diff_tmp += dst_stride2;
+ __lsx_vst(pred0, diff, 0);
+ __lsx_vstx(pred2, diff, dst_stride);
+ __lsx_vstx(pred4, diff, dst_stride2);
+ __lsx_vstx(pred6, diff, dst_stride3);
+ __lsx_vst(pred1, diff_tmp, 0);
+ __lsx_vstx(pred3, diff_tmp, dst_stride);
+ __lsx_vstx(pred5, diff_tmp, dst_stride2);
+ __lsx_vstx(pred7, diff_tmp, dst_stride3);
+}
+/*
+ * Computes the 32x32 residual block and stores it as int16_t values.
+ *
+ * The loop runs 8 times and handles 4 rows per iteration. Each row is read
+ * as two 16-byte vectors (src / src + 16, likewise for pred); the low and
+ * high halves are byte-interleaved and reduced by __lsx_vhsubw_hu_bu, giving
+ * four 8-lane result vectors per row. These are stored at offsets 0, 16, 32
+ * and 48 from diff, which is advanced by diff_stride elements after each
+ * row. The src0..src7 / pred0..pred7 registers are reused for the results
+ * (rows 0-1 in src*, rows 2-3 in pred*).
+ */
+static void sub_blk_32x32_lsx(const uint8_t *src, int32_t src_stride,
+ const uint8_t *pred, int32_t pred_stride,
+ int16_t *diff, int32_t diff_stride) {
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+ __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ uint32_t loop_cnt;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t pred_stride2 = pred_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t pred_stride3 = pred_stride2 + pred_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+ int32_t pred_stride4 = pred_stride2 << 1;
+
+ for (loop_cnt = 8; loop_cnt--;) {
+ const uint8_t *src_tmp = src + 16; /* right 16 columns of the row */
+ const uint8_t *pred_tmp = pred + 16;
+ DUP4_ARG2(__lsx_vld, src, 0, src_tmp, 0, pred, 0, pred_tmp, 0, src0, src1,
+ pred0, pred1);
+ DUP4_ARG2(__lsx_vldx, src, src_stride, src_tmp, src_stride, src,
+ src_stride2, src_tmp, src_stride2, src2, src3, src4, src5);
+ DUP4_ARG2(__lsx_vldx, src, src_stride3, src_tmp, src_stride3, pred,
+ pred_stride, pred_tmp, pred_stride, src6, src7, pred2, pred3);
+ DUP4_ARG2(__lsx_vldx, pred, pred_stride2, pred_tmp, pred_stride2, pred,
+ pred_stride3, pred_tmp, pred_stride3, pred4, pred5, pred6, pred7);
+ DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+ reg0, reg2, reg4, reg6);
+ DUP4_ARG2(__lsx_vilvh_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+ reg1, reg3, reg5, reg7);
+ DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
+ tmp0, tmp2, tmp4, tmp6);
+ DUP4_ARG2(__lsx_vilvh_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
+ tmp1, tmp3, tmp5, tmp7);
+ DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3,
+ reg3, src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7,
+ reg7, src4, src5, src6, src7);
+ DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3,
+ tmp3, pred0, pred1, pred2, pred3);
+ DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp4, tmp4, tmp5, tmp5, tmp6, tmp6, tmp7,
+ tmp7, pred4, pred5, pred6, pred7);
+ src += src_stride4; /* next group of 4 rows */
+ pred += pred_stride4;
+ __lsx_vst(src0, diff, 0);
+ __lsx_vst(src1, diff, 16);
+ __lsx_vst(src2, diff, 32);
+ __lsx_vst(src3, diff, 48);
+ diff += diff_stride;
+ __lsx_vst(src4, diff, 0);
+ __lsx_vst(src5, diff, 16);
+ __lsx_vst(src6, diff, 32);
+ __lsx_vst(src7, diff, 48);
+ diff += diff_stride;
+ __lsx_vst(pred0, diff, 0);
+ __lsx_vst(pred1, diff, 16);
+ __lsx_vst(pred2, diff, 32);
+ __lsx_vst(pred3, diff, 48);
+ diff += diff_stride;
+ __lsx_vst(pred4, diff, 0);
+ __lsx_vst(pred5, diff, 16);
+ __lsx_vst(pred6, diff, 32);
+ __lsx_vst(pred7, diff, 48);
+ diff += diff_stride;
+ }
+}
+/*
+ * Computes the 64x64 residual block and stores it as int16_t values.
+ *
+ * The loop runs 32 times and handles 2 rows per iteration. Each row is read
+ * as four 16-byte vectors at offsets 0, 16, 32 and 48; the low and high
+ * halves are byte-interleaved and reduced by __lsx_vhsubw_hu_bu, giving
+ * eight 8-lane result vectors per row. These are stored at offsets 0..112
+ * (step 16) from diff, which is advanced by diff_stride elements after each
+ * row. The first row's results reuse src0..src7, the second row's reuse
+ * pred0..pred7.
+ */
+static void sub_blk_64x64_lsx(const uint8_t *src, int32_t src_stride,
+ const uint8_t *pred, int32_t pred_stride,
+ int16_t *diff, int32_t diff_stride) {
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+ __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ uint32_t loop_cnt;
+
+ for (loop_cnt = 32; loop_cnt--;) {
+ DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
+ src3);
+ DUP4_ARG2(__lsx_vld, pred, 0, pred, 16, pred, 32, pred, 48, pred0, pred1,
+ pred2, pred3);
+ src += src_stride; /* second row of the pair */
+ pred += pred_stride;
+ DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src4, src5, src6,
+ src7);
+ DUP4_ARG2(__lsx_vld, pred, 0, pred, 16, pred, 32, pred, 48, pred4, pred5,
+ pred6, pred7);
+ src += src_stride;
+ pred += pred_stride;
+
+ DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+ reg0, reg2, reg4, reg6);
+ DUP4_ARG2(__lsx_vilvh_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+ reg1, reg3, reg5, reg7);
+ DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
+ tmp0, tmp2, tmp4, tmp6);
+ DUP4_ARG2(__lsx_vilvh_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
+ tmp1, tmp3, tmp5, tmp7);
+ DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3,
+ reg3, src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7,
+ reg7, src4, src5, src6, src7);
+ DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3,
+ tmp3, pred0, pred1, pred2, pred3);
+ DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp4, tmp4, tmp5, tmp5, tmp6, tmp6, tmp7,
+ tmp7, pred4, pred5, pred6, pred7);
+ __lsx_vst(src0, diff, 0);
+ __lsx_vst(src1, diff, 16);
+ __lsx_vst(src2, diff, 32);
+ __lsx_vst(src3, diff, 48);
+ __lsx_vst(src4, diff, 64);
+ __lsx_vst(src5, diff, 80);
+ __lsx_vst(src6, diff, 96);
+ __lsx_vst(src7, diff, 112);
+ diff += diff_stride;
+ __lsx_vst(pred0, diff, 0);
+ __lsx_vst(pred1, diff, 16);
+ __lsx_vst(pred2, diff, 32);
+ __lsx_vst(pred3, diff, 48);
+ __lsx_vst(pred4, diff, 64);
+ __lsx_vst(pred5, diff, 80);
+ __lsx_vst(pred6, diff, 96);
+ __lsx_vst(pred7, diff, 112);
+ diff += diff_stride;
+ }
+}
+/*
+ * LSX entry point for block subtraction.
+ *
+ * Square blocks of size 4, 8, 16, 32 or 64 are routed to the matching
+ * sub_blk_<N>x<N>_lsx() kernel; every other shape (non-square, or a square
+ * size not listed) is forwarded unchanged to vpx_subtract_block_c().
+ *
+ * NOTE(review): the kernels take int32_t strides, so the ptrdiff_t stride
+ * arguments are implicitly narrowed on those paths.
+ */
+void vpx_subtract_block_lsx(int32_t rows, int32_t cols, int16_t *diff_ptr,
+ ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr,
+ ptrdiff_t pred_stride) {
+ if (rows == cols) {
+ switch (rows) {
+ case 4:
+ sub_blk_4x4_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
+ diff_stride);
+ break;
+ case 8:
+ sub_blk_8x8_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
+ diff_stride);
+ break;
+ case 16:
+ sub_blk_16x16_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
+ diff_stride);
+ break;
+ case 32:
+ sub_blk_32x32_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
+ diff_stride);
+ break;
+ case 64:
+ sub_blk_64x64_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
+ diff_stride);
+ break;
+ default: /* square, but no dedicated kernel */
+ vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr,
+ src_stride, pred_ptr, pred_stride);
+ break;
+ }
+ } else {
+ vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, src_stride,
+ pred_ptr, pred_stride);
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/txfm_macros_lsx.h b/media/libvpx/libvpx/vpx_dsp/loongarch/txfm_macros_lsx.h
new file mode 100644
index 0000000000..bd514831bf
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/txfm_macros_lsx.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_LOONGARCH_TXFM_MACROS_LSX_H_
+#define VPX_VPX_DSP_LOONGARCH_TXFM_MACROS_LSX_H_
+
+#include "vpx_util/loongson_intrinsics.h"
+/*
+ * DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1)
+ *
+ * Combines the 16-bit lanes of reg0 and reg1 with the scalar constants cnst0
+ * and cnst1 and writes two 16-bit result vectors to out0 and out1 (both must
+ * be assignable). Products are formed in 32-bit lanes and narrowed back
+ * with __lsx_vssrarni_h_w using a shift of DCT_CONST_BITS.
+ *
+ * NOTE(review): from the LSX operand conventions this appears to compute
+ * out0 = reg0 * cnst0 - reg1 * cnst1 and out1 = reg0 * cnst1 + reg1 * cnst0
+ * per lane (rounded and saturated) -- confirm against the intrinsic
+ * reference. DCT_CONST_BITS is not defined in this header and must be in
+ * scope where the macro is expanded. reg0 and reg1 are each evaluated more
+ * than once.
+ */
+#define DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) \
+ do { \
+ __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m; \
+ __m128i k0_m, k1_m, k2_m, k3_m; \
+ \
+ k0_m = __lsx_vreplgr2vr_h(cnst0); \
+ k1_m = __lsx_vreplgr2vr_h(cnst1); \
+ k2_m = __lsx_vpackev_h(k1_m, k0_m); \
+ \
+ DUP2_ARG2(__lsx_vilvl_h, reg1, reg0, reg0, reg1, s5_m, s3_m); \
+ DUP2_ARG2(__lsx_vilvh_h, reg1, reg0, reg0, reg1, s4_m, s2_m); \
+ \
+ DUP2_ARG2(__lsx_vmulwev_w_h, s5_m, k0_m, s4_m, k0_m, s1_m, s0_m); \
+ k3_m = __lsx_vmulwod_w_h(s5_m, k1_m); \
+ s1_m = __lsx_vsub_w(s1_m, k3_m); \
+ k3_m = __lsx_vmulwod_w_h(s4_m, k1_m); \
+ s0_m = __lsx_vsub_w(s0_m, k3_m); \
+ \
+ out0 = __lsx_vssrarni_h_w(s0_m, s1_m, DCT_CONST_BITS); \
+ \
+ DUP2_ARG2(__lsx_vdp2_w_h, s3_m, k2_m, s2_m, k2_m, s1_m, s0_m); \
+ out1 = __lsx_vssrarni_h_w(s0_m, s1_m, DCT_CONST_BITS); \
+ } while (0)
+/*
+ * DOT_SHIFT_RIGHT_PCK_H(in0, in1, in2, in3)
+ *
+ * Takes the 16-bit dot products of in0 and in1 against the shared
+ * coefficient vector in2 (__lsx_vdp2_w_h), then narrows both 32-bit results
+ * into the single 16-bit vector in3 with __lsx_vssrarni_h_w using a shift of
+ * DCT_CONST_BITS. in3 is the output and must be assignable; DCT_CONST_BITS
+ * is not defined in this header and must be in scope at the expansion site.
+ */
+#define DOT_SHIFT_RIGHT_PCK_H(in0, in1, in2, in3) \
+ do { \
+ __m128i tp0_m, tp1_m; \
+ \
+ DUP2_ARG2(__lsx_vdp2_w_h, in0, in2, in1, in2, tp1_m, tp0_m); \
+ in3 = __lsx_vssrarni_h_w(tp1_m, tp0_m, DCT_CONST_BITS); \
+ } while (0)
+
+#endif // VPX_VPX_DSP_LOONGARCH_TXFM_MACROS_LSX_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/variance_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/variance_lsx.c
new file mode 100644
index 0000000000..8fad342c71
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/variance_lsx.c
@@ -0,0 +1,263 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/variance_lsx.h"
+/*
+ * Variance from a sum of squared differences and a sum of differences:
+ * sse - ((diff * diff) >> shift). VARIANCE_WxH squares diff in uint32_t;
+ * VARIANCE_LARGE_WxH squares it in int64_t.
+ *
+ * NOTE(review): the expansions are not wrapped in outer parentheses and diff
+ * is evaluated twice. The uses visible in this file are plain
+ * `return VARIANCE_...(*sse, diff);` statements, where that is harmless;
+ * take care before using these inside a larger expression.
+ */
+#define VARIANCE_WxH(sse, diff, shift) \
+ (sse) - (((uint32_t)(diff) * (diff)) >> (shift))
+
+#define VARIANCE_LARGE_WxH(sse, diff, shift) \
+ (sse) - (((int64_t)(diff) * (diff)) >> (shift))
+/*
+ * Accumulates, over an 8-pixel-wide block of `height` rows, the value folded
+ * into `var` by CALC_MSE_AVG_B (returned) and the value folded into `avg`
+ * (written to *diff).
+ *
+ * The loop runs height >> 2 times and consumes four rows per iteration, so
+ * any remainder rows of a height that is not a multiple of 4 are ignored.
+ * Pairs of rows are merged into one vector with __lsx_vpickev_d before being
+ * passed to CALC_MSE_AVG_B.
+ *
+ * NOTE(review): rows are fetched with __lsx_vld, which presumably reads a
+ * full 16-byte vector although only 8 bytes per row are kept -- confirm
+ * that callers guarantee the extra bytes are readable.
+ */
+static uint32_t sse_diff_8width_lsx(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t height, int32_t *diff) {
+ int32_t res, ht_cnt = (height >> 2);
+ __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3, vec;
+ __m128i avg = __lsx_vldi(0);
+ __m128i var = avg;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+ int32_t ref_stride2 = ref_stride << 1;
+ int32_t ref_stride3 = ref_stride2 + ref_stride;
+ int32_t ref_stride4 = ref_stride2 << 1;
+
+ for (; ht_cnt--;) {
+ DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr + src_stride, 0,
+ src_ptr + src_stride2, 0, src_ptr + src_stride3, 0, src0, src1,
+ src2, src3);
+ src_ptr += src_stride4;
+ DUP4_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr + ref_stride, 0,
+ ref_ptr + ref_stride2, 0, ref_ptr + ref_stride3, 0, ref0, ref1,
+ ref2, ref3);
+ ref_ptr += ref_stride4;
+
+ DUP4_ARG2(__lsx_vpickev_d, src1, src0, src3, src2, ref1, ref0, ref3, ref2,
+ src0, src1, ref0, ref1); /* rows 0+1 and rows 2+3 share a vector */
+ CALC_MSE_AVG_B(src0, ref0, var, avg);
+ CALC_MSE_AVG_B(src1, ref1, var, avg);
+ }
+
+ vec = __lsx_vhaddw_w_h(avg, avg); /* widen the 16-bit sums before reducing */
+ HADD_SW_S32(vec, *diff);
+ HADD_SW_S32(var, res);
+ return res;
+}
+/*
+ * Accumulates, over a 16-pixel-wide block of `height` rows, the value folded
+ * into `var` by CALC_MSE_AVG_B (returned) and the value folded into `avg`
+ * (written to *diff).
+ *
+ * One 16-byte vector is loaded per row. The loop body is unrolled four
+ * times and runs height >> 2 times, so any remainder rows of a height that
+ * is not a multiple of 4 are ignored.
+ */
+static uint32_t sse_diff_16width_lsx(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t height, int32_t *diff) {
+ int32_t res, ht_cnt = (height >> 2);
+ __m128i src, ref, vec;
+ __m128i avg = __lsx_vldi(0);
+ __m128i var = avg;
+
+ for (; ht_cnt--;) {
+ src = __lsx_vld(src_ptr, 0);
+ src_ptr += src_stride;
+ ref = __lsx_vld(ref_ptr, 0);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src, ref, var, avg);
+
+ src = __lsx_vld(src_ptr, 0);
+ src_ptr += src_stride;
+ ref = __lsx_vld(ref_ptr, 0);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src, ref, var, avg);
+ src = __lsx_vld(src_ptr, 0);
+ src_ptr += src_stride;
+ ref = __lsx_vld(ref_ptr, 0);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src, ref, var, avg);
+
+ src = __lsx_vld(src_ptr, 0);
+ src_ptr += src_stride;
+ ref = __lsx_vld(ref_ptr, 0);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src, ref, var, avg);
+ }
+ vec = __lsx_vhaddw_w_h(avg, avg); /* widen the 16-bit sums before reducing */
+ HADD_SW_S32(vec, *diff);
+ HADD_SW_S32(var, res);
+ return res;
+}
+/*
+ * Accumulates, over a 32-pixel-wide block of `height` rows, the value folded
+ * into `var` by CALC_MSE_AVG_B (returned) and the value folded into `avg`
+ * (written to *diff).
+ *
+ * Each row is read as two 16-byte vectors at offsets 0 and 16. The loop
+ * body is unrolled four times and runs height >> 2 times, so any remainder
+ * rows of a height that is not a multiple of 4 are ignored.
+ */
+static uint32_t sse_diff_32width_lsx(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t height, int32_t *diff) {
+ int32_t res, ht_cnt = (height >> 2);
+ __m128i avg = __lsx_vldi(0);
+ __m128i src0, src1, ref0, ref1;
+ __m128i vec;
+ __m128i var = avg;
+
+ for (; ht_cnt--;) {
+ DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ DUP2_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src0, ref0, var, avg);
+ CALC_MSE_AVG_B(src1, ref1, var, avg);
+
+ DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ DUP2_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src0, ref0, var, avg);
+ CALC_MSE_AVG_B(src1, ref1, var, avg);
+
+ DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ DUP2_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src0, ref0, var, avg);
+ CALC_MSE_AVG_B(src1, ref1, var, avg);
+
+ DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ DUP2_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src0, ref0, var, avg);
+ CALC_MSE_AVG_B(src1, ref1, var, avg);
+ }
+
+ vec = __lsx_vhaddw_w_h(avg, avg); /* widen the 16-bit sums before reducing */
+ HADD_SW_S32(vec, *diff);
+ HADD_SW_S32(var, res);
+ return res;
+}
+/*
+ * Fixed-size 64x64 variant of the sse_diff helpers: returns the value
+ * folded into `var` by CALC_MSE_AVG_B and writes the combined avg0..avg3
+ * sums to *diff.
+ *
+ * The loop runs 32 times and handles two rows per iteration; each row is
+ * read as four 16-byte vectors at offsets 0, 16, 32 and 48. Each of the four
+ * column groups has its own 16-bit accumulator (avg0..avg3), presumably to
+ * keep the per-lane sums from overflowing 16 bits over 64 rows -- TODO
+ * confirm. The accumulators are widened to 32 bits and added together only
+ * after the loop.
+ */
+static uint32_t sse_diff_64x64_lsx(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t *diff) {
+ int32_t res, ht_cnt = 32;
+ __m128i avg0 = __lsx_vldi(0);
+ __m128i src0, src1, src2, src3;
+ __m128i ref0, ref1, ref2, ref3;
+ __m128i vec0, vec1;
+ __m128i avg1 = avg0;
+ __m128i avg2 = avg0;
+ __m128i avg3 = avg0;
+ __m128i var = avg0;
+
+ for (; ht_cnt--;) {
+ DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
+ src0, src1, src2, src3);
+ src_ptr += src_stride;
+ DUP4_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref_ptr, 32, ref_ptr, 48,
+ ref0, ref1, ref2, ref3);
+ ref_ptr += ref_stride;
+
+ CALC_MSE_AVG_B(src0, ref0, var, avg0);
+ CALC_MSE_AVG_B(src1, ref1, var, avg1);
+ CALC_MSE_AVG_B(src2, ref2, var, avg2);
+ CALC_MSE_AVG_B(src3, ref3, var, avg3);
+ DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
+ src0, src1, src2, src3);
+ src_ptr += src_stride;
+ DUP4_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref_ptr, 32, ref_ptr, 48,
+ ref0, ref1, ref2, ref3);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src0, ref0, var, avg0);
+ CALC_MSE_AVG_B(src1, ref1, var, avg1);
+ CALC_MSE_AVG_B(src2, ref2, var, avg2);
+ CALC_MSE_AVG_B(src3, ref3, var, avg3);
+ }
+ vec0 = __lsx_vhaddw_w_h(avg0, avg0);
+ vec1 = __lsx_vhaddw_w_h(avg1, avg1);
+ vec0 = __lsx_vadd_w(vec0, vec1);
+ vec1 = __lsx_vhaddw_w_h(avg2, avg2);
+ vec0 = __lsx_vadd_w(vec0, vec1);
+ vec1 = __lsx_vhaddw_w_h(avg3, avg3);
+ vec0 = __lsx_vadd_w(vec0, vec1);
+ HADD_SW_S32(vec0, *diff);
+ HADD_SW_S32(var, res);
+ return res;
+}
+/* Per-size variance helpers; each shift equals log2(width * height). */
+#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6)
+#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8)
+
+#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10)
+#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12)
+/*
+ * Template that defines vpx_variance<wd>x<ht>_lsx(): it calls
+ * sse_diff_<wd>width_lsx() for `ht` rows, stores the returned value in *sse
+ * and returns VARIANCE_<wd>Wx<ht>H(*sse, diff).
+ */
+#define VPX_VARIANCE_WDXHT_LSX(wd, ht) \
+ uint32_t vpx_variance##wd##x##ht##_lsx( \
+ const uint8_t *src, int32_t src_stride, const uint8_t *ref, \
+ int32_t ref_stride, uint32_t *sse) { \
+ int32_t diff; \
+ \
+ *sse = \
+ sse_diff_##wd##width_lsx(src, src_stride, ref, ref_stride, ht, &diff); \
+ \
+ return VARIANCE_##wd##Wx##ht##H(*sse, diff); \
+ }
+/*
+ * Returns the value accumulated by CALC_MSE_B over a 16-pixel-wide block of
+ * `height` rows; unlike the sse_diff_* helpers no difference sum is kept.
+ *
+ * One 16-byte vector is loaded per row from each input. The loop body is
+ * unrolled four times and runs height >> 2 times, so any remainder rows of a
+ * height that is not a multiple of 4 are ignored.
+ */
+static uint32_t sse_16width_lsx(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t height) {
+ int32_t res, ht_cnt = (height >> 2);
+ __m128i src, ref;
+ __m128i var = __lsx_vldi(0);
+
+ for (; ht_cnt--;) {
+ DUP2_ARG2(__lsx_vld, src_ptr, 0, ref_ptr, 0, src, ref);
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ CALC_MSE_B(src, ref, var);
+
+ DUP2_ARG2(__lsx_vld, src_ptr, 0, ref_ptr, 0, src, ref);
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ CALC_MSE_B(src, ref, var);
+
+ DUP2_ARG2(__lsx_vld, src_ptr, 0, ref_ptr, 0, src, ref);
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ CALC_MSE_B(src, ref, var);
+
+ DUP2_ARG2(__lsx_vld, src_ptr, 0, ref_ptr, 0, src, ref);
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ CALC_MSE_B(src, ref, var);
+ }
+ HADD_SW_S32(var, res);
+ return res;
+}
+/* Define vpx_variance8x8_lsx, vpx_variance16x16_lsx and
+ * vpx_variance32x32_lsx from the template above. */
+VPX_VARIANCE_WDXHT_LSX(8, 8)
+VPX_VARIANCE_WDXHT_LSX(16, 16)
+VPX_VARIANCE_WDXHT_LSX(32, 32)
+/*
+ * 64x64 variance. Written out by hand rather than through
+ * VPX_VARIANCE_WDXHT_LSX because it uses the fixed-size sse_diff_64x64_lsx()
+ * helper, which takes no height argument. Stores the helper's return value
+ * in *sse and returns VARIANCE_64Wx64H(*sse, diff).
+ */
+uint32_t vpx_variance64x64_lsx(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ uint32_t *sse) {
+ int32_t diff;
+
+ *sse = sse_diff_64x64_lsx(src, src_stride, ref, ref_stride, &diff);
+
+ return VARIANCE_64Wx64H(*sse, diff);
+}
+/*
+ * 16x16 MSE entry point: runs sse_16width_lsx() over 16 rows, stores the
+ * result in *sse and returns that same value.
+ */
+uint32_t vpx_mse16x16_lsx(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ uint32_t *sse) {
+ *sse = sse_16width_lsx(src, src_stride, ref, ref_stride, 16);
+
+ return *sse;
+}
+/*
+ * Runs sse_diff_16width_lsx() over 16 rows, storing its return value in *sse
+ * and the difference sum it produces in *sum. No variance is computed here.
+ */
+void vpx_get16x16var_lsx(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride, uint32_t *sse,
+ int32_t *sum) {
+ *sse = sse_diff_16width_lsx(src, src_stride, ref, ref_stride, 16, sum);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/variance_lsx.h b/media/libvpx/libvpx/vpx_dsp/loongarch/variance_lsx.h
new file mode 100644
index 0000000000..cf9e9890ff
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/variance_lsx.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_LOONGARCH_VARIANCE_LSX_H_
+#define VPX_VPX_DSP_LOONGARCH_VARIANCE_LSX_H_
+
+#include "vpx_util/loongson_intrinsics.h"
+/*
+ * HADD_SW_S32(in0, in1)
+ *
+ * Reduces the vector in0 to a scalar and assigns it to in1 (which must be
+ * assignable): two widening horizontal adds (32 -> 64 -> 128 bit) followed
+ * by extraction of 32-bit element 0. NOTE(review): this presumably yields
+ * the sum of the four signed 32-bit lanes truncated to 32 bits -- confirm
+ * against the intrinsic reference.
+ */
+#define HADD_SW_S32(in0, in1) \
+ do { \
+ __m128i res0_m; \
+ \
+ res0_m = __lsx_vhaddw_d_w(in0, in0); \
+ res0_m = __lsx_vhaddw_q_d(res0_m, res0_m); \
+ in1 = __lsx_vpickve2gr_w(res0_m, 0); \
+ } while (0)
+/*
+ * HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift, in2)
+ *
+ * Shuffles bytes from in1/in0 according to mask, takes the unsigned byte
+ * dot product of the shuffled vector with coeff into 16-bit lanes, applies
+ * __lsx_vsrari_h with the given shift and assigns the result to in2 (the
+ * output; must be assignable). shift is passed straight to an
+ * immediate-form intrinsic, so it presumably has to be a compile-time
+ * constant -- confirm.
+ */
+#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift, in2) \
+ do { \
+ __m128i tmp0_m, tmp1_m; \
+ \
+ tmp0_m = __lsx_vshuf_b(in1, in0, mask); \
+ tmp1_m = __lsx_vdp2_h_bu(tmp0_m, coeff); \
+ in2 = __lsx_vsrari_h(tmp1_m, shift); \
+ } while (0)
+/*
+ * CALC_MSE_B(src, ref, var)
+ *
+ * Byte-interleaves src and ref (low and high halves), reduces each byte pair
+ * to a 16-bit difference with __lsx_vhsubw_hu_bu, and adds the dot product
+ * of each difference vector with itself (i.e. the squared differences) into
+ * the 32-bit accumulator var, which is updated in place.
+ */
+#define CALC_MSE_B(src, ref, var) \
+ do { \
+ __m128i src_l0_m, src_l1_m; \
+ __m128i res_l0_m, res_l1_m; \
+ \
+ src_l0_m = __lsx_vilvl_b(src, ref); \
+ src_l1_m = __lsx_vilvh_b(src, ref); \
+ DUP2_ARG2(__lsx_vhsubw_hu_bu, src_l0_m, src_l0_m, src_l1_m, src_l1_m, \
+ res_l0_m, res_l1_m); \
+ var = __lsx_vdp2add_w_h(var, res_l0_m, res_l0_m); \
+ var = __lsx_vdp2add_w_h(var, res_l1_m, res_l1_m); \
+ } while (0)
+/*
+ * CALC_MSE_AVG_B(src, ref, var, sub)
+ *
+ * Same as CALC_MSE_B, and additionally adds the 16-bit difference vectors
+ * themselves into sub with __lsx_vadd_h. Both var and sub are updated in
+ * place. sub accumulates in 16-bit lanes, so callers are responsible for
+ * widening it before those lanes can overflow.
+ */
+#define CALC_MSE_AVG_B(src, ref, var, sub) \
+ do { \
+ __m128i src_l0_m, src_l1_m; \
+ __m128i res_l0_m, res_l1_m; \
+ \
+ src_l0_m = __lsx_vilvl_b(src, ref); \
+ src_l1_m = __lsx_vilvh_b(src, ref); \
+ DUP2_ARG2(__lsx_vhsubw_hu_bu, src_l0_m, src_l0_m, src_l1_m, src_l1_m, \
+ res_l0_m, res_l1_m); \
+ var = __lsx_vdp2add_w_h(var, res_l0_m, res_l0_m); \
+ var = __lsx_vdp2add_w_h(var, res_l1_m, res_l1_m); \
+ sub = __lsx_vadd_h(sub, res_l0_m); \
+ sub = __lsx_vadd_h(sub, res_l1_m); \
+ } while (0)
+
+#endif // VPX_VPX_DSP_LOONGARCH_VARIANCE_LSX_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_horiz_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_horiz_lsx.c
new file mode 100644
index 0000000000..1c59228813
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_horiz_lsx.c
@@ -0,0 +1,972 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/vpx_convolve_lsx.h"
+
+/* Byte-index rows used as control vectors for __lsx_vshuf_b. Every row lists
+ * overlapping neighbour pairs (0,1), (1,2), ... so that a pairwise dot
+ * product sees two adjacent pixels per output position.
+ * In this file the row at byte offset 0 is loaded by the 8-pixel-wide and
+ * wider paths and the row at byte offset 16 by the 4-pixel-wide paths.
+ * Indices of 16 and above appear only in the 4-width rows; presumably they
+ * select from the other shuffle operand so two rows share one vector. */
+static const uint8_t mc_filt_mask_arr[16 * 3] = {
+ /* 8 width cases */
+ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+ /* 4 width cases */
+ 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+ /* 4 width cases */
+ 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+};
+
+/* 8-tap horizontal filter of a 4x4 block; the filtered pixels are combined
+ * with the pixels already in dst by a rounding byte average and stored back.
+ * src, src_stride: source rows; the function backs src up by 3 bytes itself.
+ * dst, dst_stride: destination rows, read for the average and then written.
+ * filter: 8 signed 8-bit taps, read as four replicated 16-bit tap pairs. */
+static void common_hz_8t_and_aver_dst_4x4_lsx(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter) {
+ uint8_t *dst_tmp = dst;
+ __m128i src0, src1, src2, src3;
+ __m128i filter0, filter1, filter2, filter3;
+ __m128i mask0, mask1, mask2, mask3;
+ __m128i tmp0, tmp1;
+ __m128i dst0, dst1, dst2, dst3;
+
+ /* Byte offset 16 selects the first "4 width" shuffle row. */
+ mask0 = __lsx_vld(mc_filt_mask_arr, 16);
+ src -= 3;
+ /* mask1..mask3 are mask0 with every index advanced by 2, 4 and 6. */
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 6);
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filter0, filter1, filter2, filter3);
+ LSX_LD_4(src, src_stride, src0, src1, src2, src3);
+ /* Flip bit 7 of every source byte before the filter macro; the same flip is
+ * applied again to the narrowed result below. */
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);
+ /* Project macro (declared outside this file); results land in tmp0/tmp1. */
+ HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+ filter0, filter1, filter2, filter3, tmp0, tmp1);
+ /* Gather the four 4-byte dst rows into a single vector. */
+ dst0 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst1 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst2 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst3 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst0 = __lsx_vilvl_w(dst1, dst0);
+ dst1 = __lsx_vilvl_w(dst3, dst2);
+ dst0 = __lsx_vilvl_d(dst1, dst0);
+ /* Saturating, rounding shift by 7 that narrows the 16-bit sums to bytes. */
+ tmp0 = __lsx_vssrarni_b_h(tmp1, tmp0, 7);
+ tmp0 = __lsx_vxori_b(tmp0, 128);
+ dst0 = __lsx_vavgr_bu(tmp0, dst0);
+ /* One 32-bit element per output row. */
+ __lsx_vstelm_w(dst0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_w(dst0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_w(dst0, dst, 0, 2);
+ dst += dst_stride;
+ __lsx_vstelm_w(dst0, dst, 0, 3);
+}
+
+/* 8-tap horizontal filter of a 4x8 block, averaged (rounding byte average)
+ * with the pixels already in dst and stored back.
+ * src, src_stride: source rows; the function backs src up by 3 bytes itself.
+ * dst, dst_stride: destination rows, read for the average and then written.
+ * filter: 8 signed 8-bit taps, read as four replicated 16-bit tap pairs. */
+static void common_hz_8t_and_aver_dst_4x8_lsx(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter) {
+ uint8_t *dst_tmp = dst;
+ __m128i src0, src1, src2, src3, filter0, filter1, filter2, filter3;
+ __m128i mask0, mask1, mask2, mask3, tmp0, tmp1, tmp2, tmp3;
+ __m128i dst0, dst1;
+
+ /* Byte offset 16 selects the first "4 width" shuffle row. */
+ mask0 = __lsx_vld(mc_filt_mask_arr, 16);
+ src -= 3;
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 6);
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filter0, filter1, filter2, filter3);
+
+ /* NOTE(review): LSX_LD_4 is defined outside this file; the extra
+ * src += src_stride below suggests it leaves src on the fourth row it
+ * loaded -- confirm before reordering these two statements. */
+ LSX_LD_4(src, src_stride, src0, src1, src2, src3);
+ src += src_stride;
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);
+ /* dst rows 0-3 gathered into dst0 (tmp0..tmp3 are scratch here). */
+ tmp0 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ tmp1 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ tmp2 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ tmp3 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ tmp0 = __lsx_vilvl_w(tmp1, tmp0);
+ tmp1 = __lsx_vilvl_w(tmp3, tmp2);
+ dst0 = __lsx_vilvl_d(tmp1, tmp0);
+
+ /* dst rows 4-7 gathered into dst1. */
+ tmp0 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ tmp1 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ tmp2 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ tmp3 = __lsx_vldrepl_w(dst_tmp, 0);
+ tmp0 = __lsx_vilvl_w(tmp1, tmp0);
+ tmp1 = __lsx_vilvl_w(tmp3, tmp2);
+ dst1 = __lsx_vilvl_d(tmp1, tmp0);
+
+ /* First four source rows -> tmp0/tmp1, next four -> tmp2/tmp3. */
+ HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+ filter0, filter1, filter2, filter3, tmp0, tmp1);
+ LSX_LD_4(src, src_stride, src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);
+ HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+ filter0, filter1, filter2, filter3, tmp2, tmp3);
+ /* Each result is narrowed against itself, then the low 64-bit halves are
+ * joined pairwise so tmp0 holds rows 0-3 and tmp1 rows 4-7. */
+ DUP4_ARG3(__lsx_vssrarni_b_h, tmp0, tmp0, 7, tmp1, tmp1, 7, tmp2, tmp2, 7,
+ tmp3, tmp3, 7, tmp0, tmp1, tmp2, tmp3);
+ DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
+ DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+ DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp1, dst1, dst0, dst1);
+ /* One 32-bit element per output row. */
+ __lsx_vstelm_w(dst0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_w(dst0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_w(dst0, dst, 0, 2);
+ dst += dst_stride;
+ __lsx_vstelm_w(dst0, dst, 0, 3);
+ dst += dst_stride;
+ __lsx_vstelm_w(dst1, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_w(dst1, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_w(dst1, dst, 0, 2);
+ dst += dst_stride;
+ __lsx_vstelm_w(dst1, dst, 0, 3);
+}
+
+/* Height dispatcher for the 4-pixel-wide 8-tap "filter and average" path.
+ * Only heights 4 and 8 have a kernel; for any other height nothing is
+ * written. */
+static void common_hz_8t_and_aver_dst_4w_lsx(const uint8_t *src,
+                                             int32_t src_stride, uint8_t *dst,
+                                             int32_t dst_stride, int8_t *filter,
+                                             int32_t height) {
+  switch (height) {
+    case 4:
+      common_hz_8t_and_aver_dst_4x4_lsx(src, src_stride, dst, dst_stride,
+                                        filter);
+      break;
+    case 8:
+      common_hz_8t_and_aver_dst_4x8_lsx(src, src_stride, dst, dst_stride,
+                                        filter);
+      break;
+    default:
+      /* No kernel for this height: dst is left untouched. */
+      break;
+  }
+}
+
+/* 8-tap horizontal filter of an 8-pixel-wide block, processed four rows per
+ * loop iteration (height >> 2 iterations); the result is averaged (rounding
+ * byte average) with the pixels already in dst and stored back.
+ * src, src_stride: source rows; reads start 3 bytes left of src.
+ * dst, dst_stride: destination rows, read for the average and then written.
+ * filter: 8 signed 8-bit taps, read as four replicated 16-bit tap pairs.
+ * height: number of rows; any remainder of height / 4 is not processed. */
+static void common_hz_8t_and_aver_dst_8w_lsx(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, int8_t *filter,
+ int32_t height) {
+ int32_t loop_cnt = height >> 2;
+ uint8_t *dst_tmp = dst;
+ __m128i src0, src1, src2, src3, filter0, filter1, filter2, filter3;
+ __m128i mask0, mask1, mask2, mask3;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+ __m128i dst0, dst1, dst2, dst3;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+ uint8_t *_src = (uint8_t *)src - 3;
+
+ /* Byte offset 0 selects the "8 width" shuffle row. */
+ mask0 = __lsx_vld(mc_filt_mask_arr, 0);
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 6);
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filter0, filter1, filter2, filter3);
+ for (; loop_cnt--;) {
+ /* Four consecutive source rows. */
+ src0 = __lsx_vld(_src, 0);
+ DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
+ src3 = __lsx_vldx(_src, src_stride3);
+ _src += src_stride4;
+ /* Flip bit 7 of every source byte; undone on the narrowed result below. */
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ mask3, filter0, filter1, filter2, filter3, tmp0,
+ tmp1, tmp2, tmp3);
+ /* Four 8-byte dst rows packed two per vector. */
+ dst0 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst1 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst2 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst3 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1);
+ /* Saturating, rounding shift by 7 that narrows 16-bit sums to bytes. */
+ DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, tmp0, tmp1);
+ DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+ DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp1, dst1, dst0, dst1);
+ /* One 64-bit element per output row. */
+ __lsx_vstelm_d(dst0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(dst0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_d(dst1, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(dst1, dst, 0, 1);
+ dst += dst_stride;
+ }
+}
+
+/* 8-tap horizontal filter of a 16-pixel-wide block, two rows per loop
+ * iteration (height >> 1 iterations); the result is averaged (rounding byte
+ * average) with the pixels already in dst and stored back.
+ * src, src_stride: source rows; the function backs src up by 3 bytes itself.
+ * dst, dst_stride: destination rows, read for the average and then written.
+ * filter: 8 signed 8-bit taps, read as four replicated 16-bit tap pairs.
+ * height: number of rows; an odd trailing row is not processed. */
+static void common_hz_8t_and_aver_dst_16w_lsx(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ int32_t loop_cnt = height >> 1;
+ int32_t dst_stride2 = dst_stride << 1;
+ uint8_t *dst_tmp = dst;
+ __m128i src0, src1, src2, src3, filter0, filter1, filter2, filter3;
+ __m128i mask0, mask1, mask2, mask3, dst0, dst1, dst2, dst3;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ __m128i tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+
+ /* Byte offset 0 selects the "8 width" shuffle row. */
+ mask0 = __lsx_vld(mc_filt_mask_arr, 0);
+ src -= 3;
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 6);
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filter0, filter1, filter2, filter3);
+
+ for (; loop_cnt--;) {
+ /* Per row: two overlapping 16-byte loads at byte offsets 0 and 8. */
+ DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
+ src += src_stride;
+ DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3);
+ src += src_stride;
+ dst0 = __lsx_vld(dst_tmp, 0);
+ dst1 = __lsx_vldx(dst_tmp, dst_stride);
+ dst_tmp += dst_stride2;
+ /* Flip bit 7 of every source byte; undone on the narrowed result below. */
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);
+ /* One shuffle per source vector and per tap pair (mask0..mask3). */
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, src2, src2,
+ mask0, src3, src3, mask0, tmp0, tmp1, tmp2, tmp3);
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, src2, src2,
+ mask1, src3, src3, mask1, tmp4, tmp5, tmp6, tmp7);
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2, src2, src2,
+ mask2, src3, src3, mask2, tmp8, tmp9, tmp10, tmp11);
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src1, mask3, src2, src2,
+ mask3, src3, src3, mask3, tmp12, tmp13, tmp14, tmp15);
+ /* Tap pairs 0+1 accumulate into tmp0..tmp3, pairs 2+3 into tmp4..tmp7. */
+ DUP4_ARG2(__lsx_vdp2_h_b, tmp0, filter0, tmp1, filter0, tmp2, filter0, tmp3,
+ filter0, tmp0, tmp1, tmp2, tmp3);
+ DUP4_ARG2(__lsx_vdp2_h_b, tmp8, filter2, tmp9, filter2, tmp10, filter2,
+ tmp11, filter2, tmp8, tmp9, tmp10, tmp11);
+ DUP4_ARG3(__lsx_vdp2add_h_b, tmp0, tmp4, filter1, tmp1, tmp5, filter1, tmp2,
+ tmp6, filter1, tmp3, tmp7, filter1, tmp0, tmp1, tmp2, tmp3);
+ DUP4_ARG3(__lsx_vdp2add_h_b, tmp8, tmp12, filter3, tmp9, tmp13, filter3,
+ tmp10, tmp14, filter3, tmp11, tmp15, filter3, tmp4, tmp5, tmp6,
+ tmp7);
+ /* The two partial sums are combined with a saturating 16-bit add. */
+ DUP4_ARG2(__lsx_vsadd_h, tmp0, tmp4, tmp1, tmp5, tmp2, tmp6, tmp3, tmp7,
+ tmp0, tmp1, tmp2, tmp3);
+ DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, dst2, dst3);
+ DUP2_ARG2(__lsx_vxori_b, dst2, 128, dst3, 128, dst2, dst3);
+ DUP2_ARG2(__lsx_vavgr_bu, dst0, dst2, dst1, dst3, dst0, dst1);
+ __lsx_vst(dst0, dst, 0);
+ __lsx_vstx(dst1, dst, dst_stride);
+ dst += dst_stride2;
+ }
+}
+
+/* 8-tap horizontal filter of a 32-pixel-wide block, one row per loop
+ * iteration; the result is averaged (rounding byte average) with the pixels
+ * already in dst and stored back.
+ * src, src_stride: source rows; the function backs src up by 3 bytes itself.
+ * dst, dst_stride: destination rows, read for the average and then written.
+ * filter: 8 signed 8-bit taps, read as four replicated 16-bit tap pairs.
+ * height: number of rows to process.
+ *
+ * Both halves of the destination row are loaded through dst itself before
+ * anything is stored to that row, so no separate read pointer is needed. */
+static void common_hz_8t_and_aver_dst_32w_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = height;
+  __m128i src0, src1, src2, src3, filter0, filter1, filter2, filter3;
+  __m128i mask0, mask1, mask2, mask3, dst0, dst1;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  __m128i tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+  /* Shuffle control used to build the vector that starts 8 bytes into the
+   * row from the two aligned 16-byte loads. */
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+  /* Byte offset 0 selects the "8 width" shuffle row. */
+  mask0 = __lsx_vld(mc_filt_mask_arr, 0);
+  src -= 3;
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+  for (; loop_cnt--;) {
+    /* Source vectors at byte offsets 0, 8 (via shuffle), 16 and 24. */
+    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2);
+    src3 = __lsx_vld(src, 24);
+    src1 = __lsx_vshuf_b(src2, src0, shuff);
+    src += src_stride;
+    /* Current destination row, read before it is overwritten below. */
+    DUP2_ARG2(__lsx_vld, dst, 0, dst, 16, dst0, dst1);
+    /* Flip bit 7 of every source byte; undone on the narrowed result. */
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+              src1, src2, src3);
+    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, src2, src2,
+              mask0, src3, src3, mask0, tmp0, tmp1, tmp2, tmp3);
+    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, src2, src2,
+              mask1, src3, src3, mask1, tmp4, tmp5, tmp6, tmp7);
+    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2, src2, src2,
+              mask2, src3, src3, mask2, tmp8, tmp9, tmp10, tmp11);
+    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src1, mask3, src2, src2,
+              mask3, src3, src3, mask3, tmp12, tmp13, tmp14, tmp15);
+    /* Tap pairs 0+1 accumulate into tmp0..tmp3, pairs 2+3 into tmp4..tmp7. */
+    DUP4_ARG2(__lsx_vdp2_h_b, tmp0, filter0, tmp1, filter0, tmp2, filter0, tmp3,
+              filter0, tmp0, tmp1, tmp2, tmp3);
+    DUP4_ARG2(__lsx_vdp2_h_b, tmp8, filter2, tmp9, filter2, tmp10, filter2,
+              tmp11, filter2, tmp8, tmp9, tmp10, tmp11);
+    DUP4_ARG3(__lsx_vdp2add_h_b, tmp0, tmp4, filter1, tmp1, tmp5, filter1, tmp2,
+              tmp6, filter1, tmp3, tmp7, filter1, tmp0, tmp1, tmp2, tmp3);
+    DUP4_ARG3(__lsx_vdp2add_h_b, tmp8, tmp12, filter3, tmp9, tmp13, filter3,
+              tmp10, tmp14, filter3, tmp11, tmp15, filter3, tmp4, tmp5, tmp6,
+              tmp7);
+    DUP4_ARG2(__lsx_vsadd_h, tmp0, tmp4, tmp1, tmp5, tmp2, tmp6, tmp3, tmp7,
+              tmp0, tmp1, tmp2, tmp3);
+    DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, tmp0, tmp1);
+    DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+    DUP2_ARG2(__lsx_vavgr_bu, dst0, tmp0, dst1, tmp1, dst0, dst1);
+    __lsx_vst(dst0, dst, 0);
+    __lsx_vst(dst1, dst, 16);
+    dst += dst_stride;
+  }
+}
+
+/* 8-tap horizontal filter of a 64-pixel-wide block, one row per loop
+ * iteration handled as two 32-pixel halves; the result is averaged (rounding
+ * byte average) with the pixels already in dst and stored back.
+ * src, src_stride: source rows; the function backs src up by 3 bytes itself.
+ * dst, dst_stride: destination rows, read for the average and then written.
+ * filter: 8 signed 8-bit taps, read as four replicated 16-bit tap pairs.
+ * height: number of rows to process. */
+static void common_hz_8t_and_aver_dst_64w_lsx(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ int32_t loop_cnt = height;
+ __m128i src0, src1, src2, src3;
+ __m128i filter0, filter1, filter2, filter3;
+ __m128i mask0, mask1, mask2, mask3;
+ __m128i out0, out1, out2, out3, dst0, dst1;
+ /* Shuffle control used to build the vector that starts 8 bytes after src0
+ * from two adjacent 16-byte loads. */
+ __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+ /* Byte offset 0 selects the "8 width" shuffle row. */
+ mask0 = __lsx_vld(mc_filt_mask_arr, 0);
+ src -= 3;
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 6);
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filter0, filter1, filter2, filter3);
+ for (; loop_cnt--;) {
+ /* Left half: output bytes 0..31 of the row. */
+ DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2);
+ src3 = __lsx_vld(src, 24);
+ src1 = __lsx_vshuf_b(src2, src0, shuff);
+ DUP2_ARG2(__lsx_vld, dst, 0, dst, 16, dst0, dst1);
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ mask3, filter0, filter1, filter2, filter3, out0,
+ out1, out2, out3);
+ DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+ DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+ DUP2_ARG2(__lsx_vavgr_bu, out0, dst0, out1, dst1, out0, out1);
+ __lsx_vst(out0, dst, 0);
+ __lsx_vst(out1, dst, 16);
+
+ /* Right half: output bytes 32..63 of the row. */
+ DUP2_ARG2(__lsx_vld, src, 32, src, 48, src0, src2);
+ src3 = __lsx_vld(src, 56);
+ src1 = __lsx_vshuf_b(src2, src0, shuff);
+ DUP2_ARG2(__lsx_vld, dst, 32, dst, 48, dst0, dst1);
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ mask3, filter0, filter1, filter2, filter3, out0,
+ out1, out2, out3);
+ DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+ DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+ DUP2_ARG2(__lsx_vavgr_bu, out0, dst0, out1, dst1, out0, out1);
+ __lsx_vst(out0, dst, 32);
+ __lsx_vst(out1, dst, 48);
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+/* 2-tap horizontal filter of a 4x4 block; the filtered pixels are combined
+ * with the pixels already in dst by a rounding byte average and stored back.
+ * src, src_stride: source rows (read starting at src, no left offset).
+ * dst, dst_stride: destination rows, read for the average and then written.
+ * filter: pointer to the two taps, read as one replicated 16-bit pair. */
+static void common_hz_2t_and_aver_dst_4x4_lsx(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter) {
+ __m128i src0, src1, src2, src3, mask;
+ __m128i dst0, dst1, dst2, dst3, vec0, vec1, filt0;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ uint8_t *dst_tmp = dst;
+
+ /* Byte offset 16 selects the first "4 width" shuffle row. */
+ mask = __lsx_vld(mc_filt_mask_arr, 16);
+
+ /* rearranging filter */
+ filt0 = __lsx_vldrepl_h(filter, 0);
+
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+ src3 = __lsx_vldx(src, src_stride3);
+ /* Gather the four 4-byte dst rows into a single vector. */
+ dst0 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst1 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst2 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst3 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ DUP2_ARG2(__lsx_vilvl_w, dst1, dst0, dst3, dst2, dst0, dst1);
+ dst0 = __lsx_vilvl_d(dst1, dst0);
+ /* Each shuffle draws on two source rows at once. */
+ DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask, src3, src2, mask, vec0, vec1);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec0, vec1);
+ /* Rounding shift by FILTER_BITS (defined outside this file) with unsigned
+ * saturation while narrowing to bytes. */
+ vec0 = __lsx_vssrarni_bu_h(vec1, vec0, FILTER_BITS);
+ vec0 = __lsx_vavgr_bu(vec0, dst0);
+ /* One 32-bit element per output row. */
+ __lsx_vstelm_w(vec0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_w(vec0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_w(vec0, dst, 0, 2);
+ dst += dst_stride;
+ __lsx_vstelm_w(vec0, dst, 0, 3);
+}
+
+/* 2-tap horizontal filter of a 4x8 block; the filtered pixels are combined
+ * with the pixels already in dst by a rounding byte average and stored back.
+ * src, src_stride: source rows (read starting at src, no left offset).
+ * dst, dst_stride: destination rows, read for the average and then written.
+ * filter: pointer to the two taps, read as one replicated 16-bit pair. */
+static void common_hz_2t_and_aver_dst_4x8_lsx(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter) {
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ __m128i filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
+ __m128i dst0, dst1, dst2, dst3, dst4;
+ __m128i vec4, vec5, vec6, vec7;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+ /* Start of source rows 4..7. */
+ uint8_t *src_tmp1 = (uint8_t *)src + src_stride4;
+ uint8_t *dst_tmp = dst;
+
+ /* Byte offset 16 selects the first "4 width" shuffle row. */
+ mask = __lsx_vld(mc_filt_mask_arr, 16);
+
+ /* rearranging filter */
+ filt0 = __lsx_vldrepl_h(filter, 0);
+
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+ src3 = __lsx_vldx(src, src_stride3);
+
+ src4 = __lsx_vld(src_tmp1, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src5,
+ src6);
+ src7 = __lsx_vldx(src_tmp1, src_stride3);
+
+ /* dst rows 0-3 gathered into dst0. */
+ dst0 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst1 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst2 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst3 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ DUP2_ARG2(__lsx_vilvl_w, dst1, dst0, dst3, dst2, dst0, dst1);
+ dst0 = __lsx_vilvl_d(dst1, dst0);
+
+ /* dst rows 4-7 gathered into dst1. */
+ dst1 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst2 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst3 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst4 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ DUP2_ARG2(__lsx_vilvl_w, dst2, dst1, dst4, dst3, dst1, dst2);
+ dst1 = __lsx_vilvl_d(dst2, dst1);
+
+ /* Each shuffle draws on two source rows at once. */
+ DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask, src3, src2, mask, vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src5, src4, mask, src7, src6, mask, vec2, vec3);
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
+ vec4, vec5, vec6, vec7);
+ /* Each sum vector is narrowed against itself; the low 64-bit halves are
+ * then joined so res0 holds rows 0-3 and res2 rows 4-7. */
+ DUP4_ARG3(__lsx_vssrarni_bu_h, vec4, vec4, FILTER_BITS, vec5, vec5,
+ FILTER_BITS, vec6, vec6, FILTER_BITS, vec7, vec7, FILTER_BITS, res0,
+ res1, res2, res3);
+ DUP2_ARG2(__lsx_vilvl_d, res1, res0, res3, res2, res0, res2);
+ DUP2_ARG2(__lsx_vavgr_bu, res0, dst0, res2, dst1, res0, res2);
+
+ /* One 32-bit element per output row. */
+ __lsx_vstelm_w(res0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_w(res0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_w(res0, dst, 0, 2);
+ dst += dst_stride;
+ __lsx_vstelm_w(res0, dst, 0, 3);
+ dst += dst_stride;
+
+ __lsx_vstelm_w(res2, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_w(res2, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_w(res2, dst, 0, 2);
+ dst += dst_stride;
+ __lsx_vstelm_w(res2, dst, 0, 3);
+ dst += dst_stride;
+}
+
+/* Height dispatcher for the 4-pixel-wide 2-tap "filter and average" path.
+ * Only heights 4 and 8 have a kernel; for any other height nothing is
+ * written. */
+static void common_hz_2t_and_aver_dst_4w_lsx(const uint8_t *src,
+                                             int32_t src_stride, uint8_t *dst,
+                                             int32_t dst_stride, int8_t *filter,
+                                             int32_t height) {
+  switch (height) {
+    case 4:
+      common_hz_2t_and_aver_dst_4x4_lsx(src, src_stride, dst, dst_stride,
+                                        filter);
+      break;
+    case 8:
+      common_hz_2t_and_aver_dst_4x8_lsx(src, src_stride, dst, dst_stride,
+                                        filter);
+      break;
+    default:
+      /* No kernel for this height: dst is left untouched. */
+      break;
+  }
+}
+
+/* 2-tap horizontal filter of an 8x4 block; the filtered pixels are combined
+ * with the pixels already in dst by a rounding byte average and stored back.
+ * src, src_stride: source rows (read starting at src, no left offset).
+ * dst, dst_stride: destination rows, read for the average and then written.
+ * filter: pointer to the two taps, read as one replicated 16-bit pair. */
+static void common_hz_2t_and_aver_dst_8x4_lsx(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter) {
+ __m128i src0, src1, src2, src3, mask;
+ __m128i filt0, dst0, dst1, dst2, dst3;
+ __m128i vec0, vec1, vec2, vec3;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ uint8_t *dst_tmp = dst;
+
+ /* Byte offset 0 selects the "8 width" shuffle row. */
+ mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+ /* rearranging filter */
+ filt0 = __lsx_vldrepl_h(filter, 0);
+
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+ src3 = __lsx_vldx(src, src_stride3);
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
+ vec0, vec1, vec2, vec3);
+ /* Rounding shift by FILTER_BITS (defined outside this file) with unsigned
+ * saturation; two rows are packed into each result vector. */
+ DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2,
+ FILTER_BITS, vec0, vec1);
+ /* Four 8-byte dst rows packed two per vector. */
+ dst0 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst1 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst2 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst3 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+
+ DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1);
+ DUP2_ARG2(__lsx_vavgr_bu, vec0, dst0, vec1, dst1, vec0, vec1);
+ /* One 64-bit element per output row. */
+ __lsx_vstelm_d(vec0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(vec0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_d(vec1, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(vec1, dst, 0, 1);
+}
+
+/* 2-tap horizontal filter of an 8-pixel-wide block whose height is a
+ * multiple of 4 (the dispatcher sends 8 and 16 here); the filtered pixels
+ * are combined with the pixels already in dst by a rounding byte average and
+ * stored back.
+ * src, src_stride: source rows (read starting at src, no left offset).
+ * dst, dst_stride: destination rows, read for the average and then written.
+ * filter: pointer to the two taps, read as one replicated 16-bit pair.
+ * height: number of rows; processed in groups of four, any remainder of
+ *         height / 4 is not processed.
+ *
+ * The four-row kernel is run in a loop rather than being written out two or
+ * four times, so every multiple of four is handled, not only 8 and 16. */
+static void common_hz_2t_and_aver_dst_8x8mult_lsx(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    int8_t *filter, int32_t height) {
+  __m128i src0, src1, src2, src3, mask;
+  __m128i filt0, dst0, dst1, dst2, dst3;
+  __m128i vec0, vec1, vec2, vec3;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t loop_cnt;
+  uint8_t *dst_tmp = dst;
+
+  /* Byte offset 0 selects the "8 width" shuffle row. */
+  mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+  /* rearranging filter */
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  for (loop_cnt = height >> 2; loop_cnt > 0; --loop_cnt) {
+    /* Four consecutive source rows. */
+    src0 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+    src3 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+
+    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1);
+    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+              filt0, vec0, vec1, vec2, vec3);
+    /* Rounding shift with unsigned saturation; two rows per result vector. */
+    DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2,
+              FILTER_BITS, vec0, vec2);
+
+    /* Four 8-byte dst rows packed two per vector, read before the stores. */
+    dst0 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    dst1 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    dst2 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    dst3 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1);
+
+    DUP2_ARG2(__lsx_vavgr_bu, vec0, dst0, vec2, dst1, vec0, vec2);
+    /* One 64-bit element per output row. */
+    __lsx_vstelm_d(vec0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(vec0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_d(vec2, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(vec2, dst, 0, 1);
+    dst += dst_stride;
+  }
+}
+
+/* Height dispatcher for the 8-pixel-wide 2-tap "filter and average" path:
+ * height 4 has a dedicated kernel, every other height goes to the 8x8mult
+ * kernel. */
+static void common_hz_2t_and_aver_dst_8w_lsx(const uint8_t *src,
+                                             int32_t src_stride, uint8_t *dst,
+                                             int32_t dst_stride, int8_t *filter,
+                                             int32_t height) {
+  if (height != 4) {
+    common_hz_2t_and_aver_dst_8x8mult_lsx(src, src_stride, dst, dst_stride,
+                                          filter, height);
+    return;
+  }
+
+  common_hz_2t_and_aver_dst_8x4_lsx(src, src_stride, dst, dst_stride, filter);
+}
+
+/* 2-tap horizontal filter of a 16-pixel-wide block, four rows per loop
+ * iteration; the filtered pixels are combined with the pixels already in dst
+ * by a rounding byte average and stored back.
+ * src, src_stride: source rows (read starting at src, no left offset).
+ * dst, dst_stride: destination rows, read for the average and then written.
+ * filter: pointer to the two taps, read as one replicated 16-bit pair.
+ * height: number of rows; processed in groups of four, any remainder of
+ *         height / 4 is not processed.
+ *
+ * A single loop with a signed counter covers all row groups, so a height
+ * below 4 performs no work instead of underflowing an unsigned counter. */
+static void common_hz_2t_and_aver_dst_16w_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter, int32_t height) {
+  int32_t loop_cnt;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
+  __m128i filt0, dst0;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  __m128i res0, res1, res2, res3, res4, res5, res6, res7;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  /* Second load position of each row, 8 bytes to the right of src. */
+  uint8_t *src_tmp1 = (uint8_t *)src + 8;
+
+  /* Byte offset 0 selects the "8 width" shuffle row. */
+  mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+  /* rearranging filter */
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  for (loop_cnt = height >> 2; loop_cnt > 0; --loop_cnt) {
+    /* Even-numbered vectors start at column 0 of rows 0..3 ... */
+    src0 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4);
+    src6 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+
+    /* ... odd-numbered vectors start at column 8 of the same rows. */
+    src1 = __lsx_vld(src_tmp1, 0);
+    DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src3,
+              src5);
+    src7 = __lsx_vldx(src_tmp1, src_stride3);
+    src_tmp1 += src_stride4;
+
+    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1);
+    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
+    DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5);
+    DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+              filt0, res0, res1, res2, res3);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
+              filt0, res4, res5, res6, res7);
+
+    /* Rounding shift with unsigned saturation; each result is one full row. */
+    DUP4_ARG3(__lsx_vssrarni_bu_h, res1, res0, FILTER_BITS, res3, res2,
+              FILTER_BITS, res5, res4, FILTER_BITS, res7, res6, FILTER_BITS,
+              res0, res2, res4, res6);
+
+    dst0 = __lsx_vld(dst, 0);
+    res0 = __lsx_vavgr_bu(res0, dst0);
+    __lsx_vst(res0, dst, 0);
+    dst += dst_stride;
+
+    dst0 = __lsx_vld(dst, 0);
+    res2 = __lsx_vavgr_bu(res2, dst0);
+    __lsx_vst(res2, dst, 0);
+    dst += dst_stride;
+
+    dst0 = __lsx_vld(dst, 0);
+    res4 = __lsx_vavgr_bu(res4, dst0);
+    __lsx_vst(res4, dst, 0);
+    dst += dst_stride;
+
+    dst0 = __lsx_vld(dst, 0);
+    res6 = __lsx_vavgr_bu(res6, dst0);
+    __lsx_vst(res6, dst, 0);
+    dst += dst_stride;
+  }
+}
+
+/* 2-tap horizontal filter of a 32-pixel-wide block, two rows per loop
+ * iteration (height >> 1 iterations); the filtered pixels are combined with
+ * the pixels already in dst by a rounding byte average and stored back.
+ * src, src_stride: source rows (read starting at src, no left offset).
+ * dst, dst_stride: destination rows, read for the average and then written.
+ * filter: pointer to the two taps, read as one replicated 16-bit pair.
+ * height: number of rows; an odd trailing row is not processed. */
+static void common_hz_2t_and_aver_dst_32w_lsx(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt = (height >> 1);
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ __m128i filt0, dst0, dst1;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i res0, res1, res2, res3, res4, res5, res6, res7;
+ /* Shuffle control used to build the vector that starts 8 bytes into the row
+ * from the loads at byte offsets 0 and 16. */
+ __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+ /* Byte offset 0 selects the "8 width" shuffle row. */
+ mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+ /* rearranging filter */
+ filt0 = __lsx_vldrepl_h(filter, 0);
+
+ for (; loop_cnt--;) {
+ /* First row: vectors at byte offsets 0, 8 (via shuffle), 16 and 24. */
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vld, src, 16, src, 24, src2, src3);
+ src1 = __lsx_vshuf_b(src2, src0, shuff);
+ src += src_stride;
+ /* Second row, same layout. */
+ src4 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vld, src, 16, src, 24, src6, src7);
+ src5 = __lsx_vshuf_b(src6, src4, shuff);
+ src += src_stride;
+
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5);
+ DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+ filt0, res0, res1, res2, res3);
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
+ filt0, res4, res5, res6, res7);
+ /* Rounding shift by FILTER_BITS with unsigned saturation; res0/res2 are
+ * the two halves of the first row, res4/res6 of the second. */
+ DUP4_ARG3(__lsx_vssrarni_bu_h, res1, res0, FILTER_BITS, res3, res2,
+ FILTER_BITS, res5, res4, FILTER_BITS, res7, res6, FILTER_BITS,
+ res0, res2, res4, res6);
+
+ DUP2_ARG2(__lsx_vld, dst, 0, dst, 16, dst0, dst1);
+ res0 = __lsx_vavgr_bu(res0, dst0);
+ __lsx_vst(res0, dst, 0);
+ res2 = __lsx_vavgr_bu(res2, dst1);
+ __lsx_vst(res2, dst, 16);
+ dst += dst_stride;
+
+ DUP2_ARG2(__lsx_vld, dst, 0, dst, 16, dst0, dst1);
+ res4 = __lsx_vavgr_bu(res4, dst0);
+ __lsx_vst(res4, dst, 0);
+ res6 = __lsx_vavgr_bu(res6, dst1);
+ __lsx_vst(res6, dst, 16);
+ dst += dst_stride;
+ }
+}
+
+/* 2-tap horizontal filter of a 64-pixel-wide block, one row per loop
+ * iteration; the filtered pixels are combined with the pixels already in dst
+ * by a rounding byte average and stored back.
+ * src, src_stride: source rows (read starting at src, no left offset).
+ * dst, dst_stride: destination rows, read for the average and then written.
+ * filter: pointer to the two taps, read as one replicated 16-bit pair.
+ * height: number of rows to process. */
+static void common_hz_2t_and_aver_dst_64w_lsx(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt = height;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ __m128i filt0, dst0, dst1, dst2, dst3;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i out0, out1, out2, out3, out4, out5, out6, out7;
+ /* Shuffle control used to build the vectors that sit 8 bytes after each
+ * 16-byte-spaced load from the two neighbouring loads. */
+ __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+ /* Byte offset 0 selects the "8 width" shuffle row. */
+ mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+ /* rearranging filter */
+ filt0 = __lsx_vldrepl_h(filter, 0);
+
+ for (; loop_cnt--;) {
+ /* Loads at byte offsets 0, 16, 32, 48 and 56; the offsets 8, 24 and 40 are
+ * produced by shuffling neighbouring loads. */
+ DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src2, src4,
+ src6);
+ src7 = __lsx_vld(src, 56);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src0, shuff, src4, src2, shuff, src1, src3);
+ src5 = __lsx_vshuf_b(src6, src4, shuff);
+ src += src_stride;
+
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5);
+ DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7);
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+ filt0, out0, out1, out2, out3);
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
+ filt0, out4, out5, out6, out7);
+
+ /* Rounding shift by FILTER_BITS with unsigned saturation; out0, out2, out4
+ * and out6 are the four 16-byte quarters of the row. */
+ DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2,
+ FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS,
+ out0, out2, out4, out6);
+
+ DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, dst0, dst1, dst2,
+ dst3);
+ out0 = __lsx_vavgr_bu(out0, dst0);
+ __lsx_vst(out0, dst, 0);
+ out2 = __lsx_vavgr_bu(out2, dst1);
+ __lsx_vst(out2, dst, 16);
+ out4 = __lsx_vavgr_bu(out4, dst2);
+ __lsx_vst(out4, dst, 32);
+ out6 = __lsx_vavgr_bu(out6, dst3);
+ __lsx_vst(out6, dst, 48);
+ dst += dst_stride;
+ }
+}
+
+/* Horizontal convolution whose result is averaged into dst (LSX version).
+ * src, src_stride: source block and row pitch.
+ * dst, dst_stride: destination block and row pitch; read and written.
+ * filter: kernel table; the row filter[x0_q4] supplies the 8 taps.
+ * x_step_q4: must be 16 (asserted).
+ * y0_q4, y_step_q4: only forwarded to the C fallback.
+ * w, h: block width and height. Widths 4, 8, 16, 32 and 64 use the LSX
+ *       kernels in this file; any other width falls back to
+ *       vpx_convolve8_avg_horiz_c.
+ *
+ * The 16-bit taps are narrowed to int8_t for the SIMD kernels. A kernel with
+ * taps[2] == 0 and taps[3] == 128 is rejected by the assert because 128 does
+ * not fit in an int8_t. The check compares the two elements directly instead
+ * of reading them through an int32_t pointer, which avoids the aliasing cast
+ * and does not depend on byte order. */
+void vpx_convolve8_avg_horiz_lsx(const uint8_t *src, ptrdiff_t src_stride,
+                                 uint8_t *dst, ptrdiff_t dst_stride,
+                                 const InterpKernel *filter, int x0_q4,
+                                 int x_step_q4, int y0_q4, int y_step_q4, int w,
+                                 int h) {
+  const int16_t *const filter_x = filter[x0_q4];
+  int8_t filt_hor[8];
+  int cnt;
+
+  assert(x_step_q4 == 16);
+  assert(!(filter_x[2] == 0 && filter_x[3] == 128));
+
+  for (cnt = 0; cnt < 8; ++cnt) {
+    filt_hor[cnt] = (int8_t)filter_x[cnt];
+  }
+
+  if (vpx_get_filter_taps(filter_x) == 2) {
+    /* 2-tap kernels read only the two centre taps, filt_hor[3] and [4]. */
+    switch (w) {
+      case 4:
+        common_hz_2t_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst,
+                                         (int32_t)dst_stride, &filt_hor[3], h);
+        break;
+      case 8:
+        common_hz_2t_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst,
+                                         (int32_t)dst_stride, &filt_hor[3], h);
+        break;
+      case 16:
+        common_hz_2t_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst,
+                                          (int32_t)dst_stride, &filt_hor[3], h);
+        break;
+      case 32:
+        common_hz_2t_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst,
+                                          (int32_t)dst_stride, &filt_hor[3], h);
+        break;
+      case 64:
+        common_hz_2t_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst,
+                                          (int32_t)dst_stride, &filt_hor[3], h);
+        break;
+      default:
+        vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter,
+                                  x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
+        break;
+    }
+  } else {
+    switch (w) {
+      case 4:
+        common_hz_8t_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst,
+                                         (int32_t)dst_stride, filt_hor, h);
+        break;
+      case 8:
+        common_hz_8t_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst,
+                                         (int32_t)dst_stride, filt_hor, h);
+        break;
+      case 16:
+        common_hz_8t_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst,
+                                          (int32_t)dst_stride, filt_hor, h);
+        break;
+      case 32:
+        common_hz_8t_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst,
+                                          (int32_t)dst_stride, filt_hor, h);
+        break;
+      case 64:
+        common_hz_8t_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst,
+                                          (int32_t)dst_stride, filt_hor, h);
+        break;
+      default:
+        vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter,
+                                  x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
+        break;
+    }
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c
new file mode 100644
index 0000000000..d1abf622ad
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c
@@ -0,0 +1,737 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/vpx_convolve_lsx.h"
+
+ static const uint8_t mc_filt_mask_arr[16 * 3] = { /* byte-index tables, three rows of 16; this file loads a row with __lsx_vld at byte offset 0 (8/16-wide kernels) or 16 (4-wide kernels) */
+ /* 8 width cases: adjacent byte pairs (i, i + 1) for i = 0..7 */
+ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+ /* 4 width cases: pairs from bytes 0..4, then from bytes 16..20 (indices >= 16 presumably address the second vector of a two-vector shuffle -- TODO confirm) */
+ 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+ /* 4 width cases: same pattern starting at bytes 8 and 24; this row is not loaded anywhere in this file */
+ 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+ };
+
+ static void common_hv_8ht_8vt_and_aver_dst_4w_lsx( /* 4-pixel-wide filter: 8-tap horizontal pass, 8-tap vertical pass, then rounding average with the pixels already in dst */
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height) {
+ uint32_t loop_cnt = height >> 2; /* each loop pass writes 4 dst rows */
+ uint8_t *dst_tmp = dst; /* read cursor over dst; dst itself is the write cursor */
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+ __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3;
+ __m128i mask0, mask1, mask2, mask3;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+ __m128i out0, out1;
+ __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; /* shuffle control: byte indices 8..15 followed by 16..23 */
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride + src_stride2;
+ int32_t src_stride4 = src_stride2 << 1;
+ uint8_t *_src = (uint8_t *)src - 3 - src_stride3; /* start 3 bytes left of and 3 rows above src */
+
+ mask0 = __lsx_vld(mc_filt_mask_arr, 16); /* second table row (byte offset 16) */
+ DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz, 4,
+ filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3); /* filt_hzN: the tap pair at bytes 2N..2N+1 of filter_horiz, replicated */
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); /* mask1/mask2: mask0 with 2 / 4 added to every index */
+ mask3 = __lsx_vaddi_bu(mask0, 6); /* mask3: mask0 + 6 */
+
+ src0 = __lsx_vld(_src, 0); /* preload 7 source rows (src0..src6) before the loop */
+ DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
+ src3 = __lsx_vldx(_src, src_stride3);
+ _src += src_stride4;
+ src4 = __lsx_vld(_src, 0);
+ DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
+ _src += src_stride3; /* _src now addresses the 8th row */
+
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3); /* XOR every byte with 128; presumably biases pixels for the signed helper arithmetic -- TODO confirm */
+ DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
+ src6 = __lsx_vxori_b(src6, 128);
+
+ tmp0 = horiz_8tap_filt(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3); /* horiz_8tap_filt is defined outside this file; here each call is given two rows */
+ tmp2 = horiz_8tap_filt(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ tmp4 = horiz_8tap_filt(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ tmp5 = horiz_8tap_filt(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3); /* note: row src5 is fed to both this call and the previous one */
+ DUP2_ARG3(__lsx_vshuf_b, tmp2, tmp0, shuff, tmp4, tmp2, shuff, tmp1, tmp3); /* tmp1 / tmp3: halves of neighbouring results recombined via shuff */
+ DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4,
+ filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3); /* filt_vtN: the tap pair at bytes 2N..2N+1 of filter_vert, replicated */
+ DUP2_ARG2(__lsx_vpackev_b, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1); /* tmp0..tmp2 become the three carried inputs of the vertical filter */
+ tmp2 = __lsx_vpackev_b(tmp5, tmp4);
+ for (; loop_cnt--;) {
+ src7 = __lsx_vld(_src, 0); /* four new source rows per pass */
+ DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
+ src10 = __lsx_vldx(_src, src_stride3);
+ _src += src_stride4;
+ src2 = __lsx_vldrepl_w(dst_tmp, 0); /* src2..src5 are reused as scratch: 32 bits from each of the 4 current dst rows */
+ dst_tmp += dst_stride;
+ src3 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ src4 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ src5 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ DUP2_ARG2(__lsx_vilvl_w, src3, src2, src5, src4, src2, src3);
+ src2 = __lsx_vilvl_d(src3, src2); /* src2 now holds the four dst rows used by the average below */
+ DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7,
+ src8, src9, src10); /* same XOR-128 bias as the preloaded rows */
+ tmp3 = horiz_8tap_filt(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ tmp4 = __lsx_vshuf_b(tmp3, tmp5, shuff);
+ tmp4 = __lsx_vpackev_b(tmp3, tmp4);
+ out0 = filt_8tap_dpadd_s_h(tmp0, tmp1, tmp2, tmp4, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3); /* vertical filter over the window tmp0, tmp1, tmp2, tmp4 (helper defined outside this file) */
+ src1 = horiz_8tap_filt(src9, src10, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ src0 = __lsx_vshuf_b(src1, tmp3, shuff);
+ src0 = __lsx_vpackev_b(src1, src0);
+ out1 = filt_8tap_dpadd_s_h(tmp1, tmp2, tmp4, src0, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3); /* same filter with the window advanced by one vector */
+ out0 = __lsx_vssrarni_b_h(out1, out0, FILTER_BITS); /* shift right by FILTER_BITS with rounding, saturate and narrow both results to bytes */
+ out0 = __lsx_vxori_b(out0, 128); /* undo the XOR-128 bias */
+ out0 = __lsx_vavgr_bu(out0, src2); /* rounding average with the dst pixels gathered above */
+ __lsx_vstelm_w(out0, dst, 0, 0); /* one 32-bit lane is stored per dst row */
+ dst += dst_stride;
+ __lsx_vstelm_w(out0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_w(out0, dst, 0, 2);
+ dst += dst_stride;
+ __lsx_vstelm_w(out0, dst, 0, 3);
+ dst += dst_stride;
+
+ tmp5 = src1; /* carry the newest results into the next pass */
+ tmp0 = tmp2;
+ tmp1 = tmp4;
+ tmp2 = src0;
+ }
+ }
+
+ static void common_hv_8ht_8vt_and_aver_dst_8w_lsx( /* 8-pixel-wide filter: 8-tap horizontal pass, 8-tap vertical pass, then rounding average with the pixels already in dst */
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height) {
+ uint32_t loop_cnt = height >> 2; /* each loop pass writes 4 dst rows */
+ uint8_t *dst_tmp = dst; /* read cursor over dst; dst itself is the write cursor */
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+ __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3;
+ __m128i mask0, mask1, mask2, mask3;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
+ __m128i out0, out1;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride + src_stride2;
+ int32_t src_stride4 = src_stride2 << 1;
+ uint8_t *_src = (uint8_t *)src - 3 - src_stride3; /* start 3 bytes left of and 3 rows above src */
+
+ mask0 = __lsx_vld(mc_filt_mask_arr, 0); /* first table row (byte offset 0) */
+ DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz, 4,
+ filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3); /* filt_hzN: the tap pair at bytes 2N..2N+1 of filter_horiz, replicated */
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); /* mask1/mask2: mask0 with 2 / 4 added to every index */
+ mask3 = __lsx_vaddi_bu(mask0, 6); /* mask3: mask0 + 6 */
+
+ src0 = __lsx_vld(_src, 0); /* preload 7 source rows (src0..src6) before the loop */
+ DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
+ src3 = __lsx_vldx(_src, src_stride3);
+ _src += src_stride4;
+ src4 = __lsx_vld(_src, 0);
+ DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
+ _src += src_stride3; /* _src now addresses the 8th row */
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3); /* XOR every byte with 128; presumably biases pixels for the signed helper arithmetic -- TODO confirm */
+ DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
+ src6 = __lsx_vxori_b(src6, 128);
+
+ src0 = horiz_8tap_filt(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3); /* each row is filtered on its own: the same vector is passed for both inputs; srcN is overwritten with its filtered form */
+ src1 = horiz_8tap_filt(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ src2 = horiz_8tap_filt(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ src3 = horiz_8tap_filt(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ src4 = horiz_8tap_filt(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ src5 = horiz_8tap_filt(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ src6 = horiz_8tap_filt(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+
+ DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4,
+ filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3); /* filt_vtN: the tap pair at bytes 2N..2N+1 of filter_vert, replicated */
+ DUP4_ARG2(__lsx_vpackev_b, src1, src0, src3, src2, src5, src4, src2, src1,
+ tmp0, tmp1, tmp2, tmp4); /* tmp0/tmp1/tmp2 pair filtered rows (0,1) (2,3) (4,5); tmp4 pairs (1,2) */
+ DUP2_ARG2(__lsx_vpackev_b, src4, src3, src6, src5, tmp5, tmp6); /* tmp5/tmp6 pair rows (3,4) (5,6) */
+
+ for (; loop_cnt--;) {
+ src7 = __lsx_vld(_src, 0); /* four new source rows per pass */
+ DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
+ src10 = __lsx_vldx(_src, src_stride3);
+ _src += src_stride4;
+
+ DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7,
+ src8, src9, src10); /* same XOR-128 bias as the preloaded rows */
+ src7 = horiz_8tap_filt(src7, src7, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ tmp3 = __lsx_vpackev_b(src7, src6); /* pair the newest row with the previous one */
+ out0 = filt_8tap_dpadd_s_h(tmp0, tmp1, tmp2, tmp3, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3); /* first of the 4 output rows (helper defined outside this file) */
+ src8 = horiz_8tap_filt(src8, src8, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ src0 = __lsx_vpackev_b(src8, src7);
+ out1 = filt_8tap_dpadd_s_h(tmp4, tmp5, tmp6, src0, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3); /* second output row, from the odd-aligned pair chain tmp4..tmp6 */
+ src9 = horiz_8tap_filt(src9, src9, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ src1 = __lsx_vpackev_b(src9, src8);
+ src3 = filt_8tap_dpadd_s_h(tmp1, tmp2, tmp3, src1, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3); /* third output row (src3 reused as scratch) */
+ src10 = horiz_8tap_filt(src10, src10, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ src2 = __lsx_vpackev_b(src10, src9);
+ src4 = filt_8tap_dpadd_s_h(tmp5, tmp6, src0, src2, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3); /* fourth output row (src4 reused as scratch) */
+ DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, FILTER_BITS, src4, src3,
+ FILTER_BITS, out0, out1); /* shift right by FILTER_BITS with rounding, saturate and narrow to bytes: two output rows per vector */
+ DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); /* undo the XOR-128 bias */
+ src5 = __lsx_vldrepl_d(dst_tmp, 0); /* 8 bytes from each of the 4 current dst rows (src5, src7, src8, src9 reused as scratch) */
+ dst_tmp += dst_stride;
+ src7 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ src8 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ src9 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ DUP2_ARG2(__lsx_vilvl_d, src7, src5, src9, src8, src5, src7); /* src5: first two dst rows, src7: last two */
+ DUP2_ARG2(__lsx_vavgr_bu, out0, src5, out1, src7, out0, out1); /* rounding average with the existing dst pixels */
+ __lsx_vstelm_d(out0, dst, 0, 0); /* one 64-bit lane is stored per dst row */
+ dst += dst_stride;
+ __lsx_vstelm_d(out0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_d(out1, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(out1, dst, 0, 1);
+ dst += dst_stride;
+
+ src6 = src10; /* newest filtered row becomes the "previous row" of the next pass */
+ tmp0 = tmp2; /* slide the even-aligned pair chain down by four rows */
+ tmp1 = tmp3;
+ tmp2 = src1;
+ tmp4 = tmp6; /* slide the odd-aligned pair chain down by four rows */
+ tmp5 = src0;
+ tmp6 = src2;
+ }
+ }
+
+ static void common_hv_8ht_8vt_and_aver_dst_16w_lsx(
+     const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+     const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height) {
+   /* A 16-wide block is processed as two adjacent 8-wide column strips. */
+   int32_t col;
+   for (col = 0; col < 16; col += 8) {
+     common_hv_8ht_8vt_and_aver_dst_8w_lsx(
+         src + col, src_stride, dst + col, dst_stride, filter_horiz,
+         filter_vert, height);
+   }
+ }
+
+ static void common_hv_8ht_8vt_and_aver_dst_32w_lsx(
+     const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+     const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height) {
+   /* A 32-wide block is four 8-wide column strips, taken left to right. */
+   int32_t col;
+
+   for (col = 0; col < 32; col += 8) {
+     common_hv_8ht_8vt_and_aver_dst_8w_lsx(
+         src + col, src_stride, dst + col, dst_stride, filter_horiz,
+         filter_vert, height);
+   }
+ }
+
+ static void common_hv_8ht_8vt_and_aver_dst_64w_lsx(
+     const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+     const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height) {
+   /* A 64-wide block is eight 8-wide column strips, taken left to right;
+    * every strip is handed to the 8-wide kernel with the same filters. */
+   int32_t col;
+
+   for (col = 0; col < 64; col += 8) {
+     common_hv_8ht_8vt_and_aver_dst_8w_lsx(
+         src + col, src_stride, dst + col, dst_stride, filter_horiz,
+         filter_vert, height);
+   }
+ }
+
+ static void common_hv_2ht_2vt_and_aver_dst_4x4_lsx( /* 4x4 block: 2-tap horizontal pass, 2-tap vertical pass, then rounding average with the pixels already in dst */
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert) {
+ __m128i src0, src1, src2, src3, src4, mask;
+ __m128i filt_hz, filt_vt, vec0, vec1;
+ __m128i dst0, dst1, dst2, dst3;
+ __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1;
+ __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; /* shuffle control: byte indices 8..15 followed by 16..23 */
+
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+
+ int32_t dst_stride2 = dst_stride << 1;
+ int32_t dst_stride3 = dst_stride2 + dst_stride;
+
+ mask = __lsx_vld(mc_filt_mask_arr, 16); /* second table row (byte offset 16) */
+ /* replicate the first two bytes of each filter pointer; the caller in this file passes &filt_hor[3] / &filt_ver[3] */
+ DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt);
+
+ src0 = __lsx_vld(src, 0); /* five source rows: 4 output rows need one extra row for the vertical 2-tap */
+ DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+ src, src_stride4, src1, src2, src3, src4);
+
+ hz_out0 = horiz_2tap_filt_uh(src0, src1, mask, filt_hz); /* helper defined outside this file; given two rows per call */
+ hz_out2 = horiz_2tap_filt_uh(src2, src3, mask, filt_hz);
+ hz_out4 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); /* last row has no partner, so it is passed twice */
+ hz_out1 = __lsx_vshuf_b(hz_out2, hz_out0, shuff); /* recombine halves of hz_out0 and hz_out2 via shuff */
+ hz_out3 = __lsx_vpickod_d(hz_out4, hz_out2); /* odd 64-bit lanes of hz_out2 and hz_out4 */
+ DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); /* interleave bytes of vertically adjacent results */
+
+ dst0 = __lsx_vldrepl_w(dst, 0); /* 32 bits from each of the 4 dst rows */
+ dst1 = __lsx_vldrepl_w(dst + dst_stride, 0);
+ dst2 = __lsx_vldrepl_w(dst + dst_stride2, 0);
+ dst3 = __lsx_vldrepl_w(dst + dst_stride3, 0);
+ dst0 = __lsx_vilvl_w(dst1, dst0);
+ dst1 = __lsx_vilvl_w(dst3, dst2);
+ dst0 = __lsx_vilvl_d(dst1, dst0); /* dst0 now holds all four dst rows */
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); /* vertical 2-tap: unsigned byte-pair dot product into 16-bit lanes */
+ tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); /* shift right by FILTER_BITS with rounding, saturate and narrow to unsigned bytes */
+ tmp0 = __lsx_vavgr_bu(tmp0, dst0); /* rounding average with the existing dst pixels */
+ __lsx_vstelm_w(tmp0, dst, 0, 0); /* one 32-bit lane is stored per dst row */
+ __lsx_vstelm_w(tmp0, dst + dst_stride, 0, 1);
+ __lsx_vstelm_w(tmp0, dst + dst_stride2, 0, 2);
+ __lsx_vstelm_w(tmp0, dst + dst_stride3, 0, 3);
+ }
+
+ static void common_hv_2ht_2vt_and_aver_dst_4x8_lsx( /* 4x8 block: 2-tap horizontal pass, 2-tap vertical pass, then rounding average with the pixels already in dst */
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert) {
+ uint8_t *dst_tmp = dst; /* read cursor over dst; dst itself is the write cursor */
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
+ __m128i filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1;
+ __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+ __m128i hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3;
+ __m128i dst0, dst1, dst2, dst3, dst4;
+ __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; /* shuffle control: byte indices 8..15 followed by 16..23 */
+
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+
+ mask = __lsx_vld(mc_filt_mask_arr, 16); /* second table row (byte offset 16) */
+
+ /* replicate the first two bytes of each filter pointer; the caller in this file passes &filt_hor[3] / &filt_ver[3] */
+ filt_hz = __lsx_vldrepl_h(filter_horiz, 0);
+ filt_vt = __lsx_vldrepl_h(filter_vert, 0);
+
+ src0 = __lsx_vld(src, 0); /* nine source rows in total: 8 output rows need one extra row for the vertical 2-tap */
+ DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+ src, src_stride4, src1, src2, src3, src4);
+ src += src_stride4;
+ DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+ src, src_stride4, src5, src6, src7, src8); /* rows 5..8 relative to the original src */
+ src += src_stride4; /* src is not read again in this function */
+
+ hz_out0 = horiz_2tap_filt_uh(src0, src1, mask, filt_hz); /* helper defined outside this file; given two rows per call */
+ hz_out2 = horiz_2tap_filt_uh(src2, src3, mask, filt_hz);
+ hz_out4 = horiz_2tap_filt_uh(src4, src5, mask, filt_hz);
+ hz_out6 = horiz_2tap_filt_uh(src6, src7, mask, filt_hz);
+ hz_out8 = horiz_2tap_filt_uh(src8, src8, mask, filt_hz); /* last row has no partner, so it is passed twice */
+ DUP2_ARG3(__lsx_vshuf_b, hz_out2, hz_out0, shuff, hz_out4, hz_out2, shuff,
+ hz_out1, hz_out3); /* odd-numbered hz_out vectors recombine halves of their even neighbours */
+ hz_out5 = __lsx_vshuf_b(hz_out6, hz_out4, shuff);
+ hz_out7 = __lsx_vpickod_d(hz_out8, hz_out6); /* odd 64-bit lanes of hz_out6 and hz_out8 */
+
+ dst0 = __lsx_vldrepl_w(dst_tmp, 0); /* 32 bits from each of dst rows 0..3 */
+ dst_tmp += dst_stride;
+ dst1 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst2 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst3 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst0 = __lsx_vilvl_w(dst1, dst0);
+ dst1 = __lsx_vilvl_w(dst3, dst2);
+ dst0 = __lsx_vilvl_d(dst1, dst0); /* dst0 now holds dst rows 0..3 */
+
+ dst1 = __lsx_vldrepl_w(dst_tmp, 0); /* 32 bits from each of dst rows 4..7 */
+ dst_tmp += dst_stride;
+ dst2 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst3 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst4 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst1 = __lsx_vilvl_w(dst2, dst1);
+ dst2 = __lsx_vilvl_w(dst4, dst3);
+ dst1 = __lsx_vilvl_d(dst2, dst1); /* dst1 now holds dst rows 4..7 */
+
+ DUP4_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, hz_out5,
+ hz_out4, hz_out7, hz_out6, vec0, vec1, vec2, vec3); /* interleave bytes of vertically adjacent results */
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, vec2, filt_vt, vec3,
+ filt_vt, tmp0, tmp1, tmp2, tmp3); /* vertical 2-tap: unsigned byte-pair dot product into 16-bit lanes */
+ DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+ FILTER_BITS, res0, res1); /* shift right by FILTER_BITS with rounding, saturate and narrow to unsigned bytes */
+ DUP2_ARG2(__lsx_vavgr_bu, res0, dst0, res1, dst1, res0, res1); /* rounding average with the existing dst pixels */
+
+ __lsx_vstelm_w(res0, dst, 0, 0); /* rows 0..3: one 32-bit lane of res0 each */
+ dst += dst_stride;
+ __lsx_vstelm_w(res0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_w(res0, dst, 0, 2);
+ dst += dst_stride;
+ __lsx_vstelm_w(res0, dst, 0, 3);
+ dst += dst_stride;
+
+ __lsx_vstelm_w(res1, dst, 0, 0); /* rows 4..7: one 32-bit lane of res1 each */
+ dst += dst_stride;
+ __lsx_vstelm_w(res1, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_w(res1, dst, 0, 2);
+ dst += dst_stride;
+ __lsx_vstelm_w(res1, dst, 0, 3);
+ }
+
+ static void common_hv_2ht_2vt_and_aver_dst_4w_lsx(
+     const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+     int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+   /* Only 4x4 and 4x8 blocks have a kernel here; for any other height this
+    * function returns without calling either kernel. */
+   if (height != 4 && height != 8) return;
+
+   (height == 4 ? common_hv_2ht_2vt_and_aver_dst_4x4_lsx
+                : common_hv_2ht_2vt_and_aver_dst_4x8_lsx)(
+       src, src_stride, dst, dst_stride, filter_horiz, filter_vert);
+ }
+
+ static void common_hv_2ht_2vt_and_aver_dst_8x4_lsx( /* 8x4 block: 2-tap horizontal pass, 2-tap vertical pass, then average with dst via AVG_ST4_D */
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert) {
+ __m128i src0, src1, src2, src3, src4, mask;
+ __m128i filt_hz, filt_vt, vec0, vec1, vec2, vec3;
+ __m128i hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
+ __m128i dst0, dst1, dst2, dst3;
+
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+
+ uint8_t *dst_tmp = dst; /* read cursor over dst */
+ mask = __lsx_vld(mc_filt_mask_arr, 0); /* first table row (byte offset 0) */
+ /* replicate the first two bytes of each filter pointer; the caller in this file passes &filt_hor[3] / &filt_ver[3] */
+ filt_hz = __lsx_vldrepl_h(filter_horiz, 0);
+ filt_vt = __lsx_vldrepl_h(filter_vert, 0);
+
+ src0 = __lsx_vld(src, 0); /* five source rows: 4 output rows need one extra row for the vertical 2-tap */
+ DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+ src, src_stride4, src1, src2, src3, src4);
+
+ dst0 = __lsx_vldrepl_d(dst_tmp, 0); /* 8 bytes from each of the 4 dst rows */
+ dst_tmp += dst_stride;
+ dst1 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst2 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst3 = __lsx_vldrepl_d(dst_tmp, 0);
+ DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); /* dst0: first two dst rows, dst1: last two */
+ hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz); /* helper defined outside this file; each row is passed for both inputs */
+ hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
+ vec0 = __lsx_vpackev_b(hz_out1, hz_out0); /* interleave rows 0 and 1 */
+ tmp0 = __lsx_vdp2_h_bu(vec0, filt_vt); /* vertical 2-tap for output row 0 */
+
+ hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz); /* hz_out0 / hz_out1 alternate as "previous" and "current" row */
+ vec1 = __lsx_vpackev_b(hz_out0, hz_out1);
+ tmp1 = __lsx_vdp2_h_bu(vec1, filt_vt); /* output row 1 */
+
+ hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz);
+ vec2 = __lsx_vpackev_b(hz_out1, hz_out0);
+ tmp2 = __lsx_vdp2_h_bu(vec2, filt_vt); /* output row 2 */
+
+ hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
+ vec3 = __lsx_vpackev_b(hz_out0, hz_out1);
+ tmp3 = __lsx_vdp2_h_bu(vec3, filt_vt); /* output row 3 */
+ DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+ FILTER_BITS, tmp0, tmp1); /* shift right by FILTER_BITS with rounding, saturate and narrow to unsigned bytes: two rows per vector */
+ AVG_ST4_D(tmp0, tmp1, dst0, dst1, dst, dst_stride); /* macro defined outside this file; presumably averages with dst0/dst1 and stores the 4 rows -- TODO confirm */
+ }
+
+ static void common_hv_2ht_2vt_and_aver_dst_8x8mult_lsx( /* 8-wide block, 4 rows per loop pass: 2-tap horizontal pass, 2-tap vertical pass, then average with dst via AVG_ST4_D */
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+ uint32_t loop_cnt = (height >> 2); /* number of 4-row passes */
+ __m128i src0, src1, src2, src3, src4, mask;
+ __m128i filt_hz, filt_vt, vec0;
+ __m128i hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
+ __m128i dst0, dst1, dst2, dst3;
+
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+
+ uint8_t *dst_tmp = dst; /* read cursor over dst; dst itself is the write cursor */
+
+ /* shuffle mask (first table row), then the replicated first two bytes of each filter pointer */
+ mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+ filt_hz = __lsx_vldrepl_h(filter_horiz, 0);
+ filt_vt = __lsx_vldrepl_h(filter_vert, 0);
+
+ src0 = __lsx_vld(src, 0); /* first source row is loaded and filtered once, ahead of the loop */
+ src += src_stride;
+
+ hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz); /* helper defined outside this file */
+
+ for (; loop_cnt--;) {
+ src1 = __lsx_vld(src, 0); /* four new source rows per pass */
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
+ src4 = __lsx_vldx(src, src_stride3);
+ src += src_stride4;
+
+ hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz); /* hz_out0 / hz_out1 alternate as "previous" and "current" row */
+ vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+ tmp0 = __lsx_vdp2_h_bu(vec0, filt_vt); /* vertical 2-tap for output row 0 of this pass */
+
+ hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz);
+ vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
+ tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt); /* output row 1 */
+
+ hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz);
+ vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+ tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt); /* output row 2 */
+
+ hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); /* left in hz_out0 as the "previous row" of the next pass */
+ vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
+ tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt); /* output row 3 */
+ DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+ FILTER_BITS, tmp0, tmp1); /* shift right by FILTER_BITS with rounding, saturate and narrow to unsigned bytes: two rows per vector */
+
+ dst0 = __lsx_vldrepl_d(dst_tmp, 0); /* 8 bytes from each of the 4 current dst rows */
+ dst_tmp += dst_stride;
+ dst1 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst2 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst3 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); /* dst0: first two dst rows, dst1: last two */
+ AVG_ST4_D(tmp0, tmp1, dst0, dst1, dst, dst_stride); /* macro defined outside this file; presumably averages and stores 4 rows */
+ dst += dst_stride; /* only one row is added here, which suggests AVG_ST4_D itself advances dst by three rows -- TODO confirm */
+ }
+ }
+
+ static void common_hv_2ht_2vt_and_aver_dst_8w_lsx(
+     const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+     int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+   if (height != 4) { /* every height other than 4 goes to the row-looping kernel */
+     common_hv_2ht_2vt_and_aver_dst_8x8mult_lsx(
+         src, src_stride, dst, dst_stride, filter_horiz, filter_vert, height);
+   } else { /* exactly four rows: dedicated 8x4 kernel */
+     common_hv_2ht_2vt_and_aver_dst_8x4_lsx(src, src_stride, dst, dst_stride,
+                                            filter_horiz, filter_vert);
+   }
+ }
+
+ static void common_hv_2ht_2vt_and_aver_dst_16w_lsx( /* 16-wide block, 4 rows per loop pass: 2-tap horizontal pass, 2-tap vertical pass, then rounding average with the pixels already in dst */
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+ uint8_t *src_tmp1; /* src + 8: base for the second 16-byte load of each row */
+ uint32_t loop_cnt = (height >> 2); /* number of 4-row passes */
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ __m128i filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3;
+ __m128i hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1, tmp3;
+
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+
+ int32_t dst_stride2 = dst_stride << 1;
+ int32_t dst_stride3 = dst_stride2 + dst_stride;
+ int32_t dst_stride4 = dst_stride << 2;
+
+ mask = __lsx_vld(mc_filt_mask_arr, 0); /* first table row (byte offset 0) */
+ /* replicate the first two bytes of each filter pointer; the caller in this file passes &filt_hor[3] / &filt_ver[3] */
+ filt_hz = __lsx_vldrepl_h(filter_horiz, 0);
+ filt_vt = __lsx_vldrepl_h(filter_vert, 0);
+
+ DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1); /* first row: 16 bytes from src and 16 bytes from src + 8 */
+ src += src_stride;
+
+ hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz); /* first row filtered once, ahead of the loop (helper defined outside this file) */
+ hz_out2 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
+
+ for (; loop_cnt--;) {
+ src0 = __lsx_vld(src, 0); /* loads at src: src0, src2, src4, src6 for the four new rows */
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4);
+ src6 = __lsx_vldx(src, src_stride3);
+ src_tmp1 = (uint8_t *)(src + 8);
+ src1 = __lsx_vld(src_tmp1, 0); /* loads at src + 8: src1, src3, src5, src7 for the same rows */
+ DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src3,
+ src5);
+ src7 = __lsx_vldx(src_tmp1, src_stride3);
+ src += src_stride4;
+ dst0 = __lsx_vld(dst, 0); /* all four dst rows are read before any of them is written */
+ DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2);
+ dst3 = __lsx_vldx(dst, dst_stride3);
+
+ hz_out1 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz); /* output row 0: hz_out0/hz_out2 hold the previous row, hz_out1/hz_out3 the current one */
+ hz_out3 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
+ DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); /* vertical 2-tap: unsigned byte-pair dot product into 16-bit lanes */
+ tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); /* round, saturate and narrow both halves into one 16-byte row */
+ tmp3 = __lsx_vavgr_bu(tmp3, dst0); /* rounding average with the existing dst row */
+ __lsx_vst(tmp3, dst, 0);
+
+ hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz); /* output row 1: the previous/current roles of the hz_out pairs swap */
+ hz_out2 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz);
+ DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+ tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ tmp3 = __lsx_vavgr_bu(tmp3, dst1);
+ __lsx_vstx(tmp3, dst, dst_stride);
+
+ hz_out1 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); /* output row 2 */
+ hz_out3 = horiz_2tap_filt_uh(src5, src5, mask, filt_hz);
+ DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+ tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ tmp3 = __lsx_vavgr_bu(tmp3, dst2);
+ __lsx_vstx(tmp3, dst, dst_stride2);
+
+ hz_out0 = horiz_2tap_filt_uh(src6, src6, mask, filt_hz); /* output row 3; hz_out0/hz_out2 are left as the "previous row" of the next pass */
+ hz_out2 = horiz_2tap_filt_uh(src7, src7, mask, filt_hz);
+ DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+ tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ tmp3 = __lsx_vavgr_bu(tmp3, dst3);
+ __lsx_vstx(tmp3, dst, dst_stride3);
+ dst += dst_stride4; /* advance to the next group of four rows */
+ }
+ }
+
+ static void common_hv_2ht_2vt_and_aver_dst_32w_lsx(
+     const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+     int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+   /* A 32-wide block is processed as two adjacent 16-wide column strips. */
+   int32_t col;
+   for (col = 0; col < 32; col += 16) {
+     common_hv_2ht_2vt_and_aver_dst_16w_lsx(
+         src + col, src_stride, dst + col, dst_stride, filter_horiz,
+         filter_vert, height);
+   }
+ }
+
+ static void common_hv_2ht_2vt_and_aver_dst_64w_lsx(
+     const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+     int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+   /* A 64-wide block is four 16-wide column strips, taken left to right. */
+   int32_t col;
+   for (col = 0; col < 64; col += 16) {
+     common_hv_2ht_2vt_and_aver_dst_16w_lsx(
+         src + col, src_stride, dst + col, dst_stride, filter_horiz,
+         filter_vert, height);
+   }
+ }
+
+ void vpx_convolve8_avg_lsx(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride,
+                            const InterpKernel *filter, int x0_q4, int x_step_q4,
+                            int y0_q4, int y_step_q4, int w, int h) {
+   const int16_t *const filter_x = filter[x0_q4];
+   const int16_t *const filter_y = filter[y0_q4];
+   int8_t filt_hor[8], filt_ver[8];
+   int8_t tap;
+
+   assert(x_step_q4 == 16);
+   assert(y_step_q4 == 16);
+   assert(((const int32_t *)filter_x)[1] != 0x800000);
+   assert(((const int32_t *)filter_y)[1] != 0x800000);
+
+   /* Copy both kernels into int8_t arrays, the element type the LSX helpers take. */
+   for (tap = 0; tap < 8; ++tap) {
+     filt_hor[tap] = filter_x[tap];
+     filt_ver[tap] = filter_y[tap];
+   }
+   if (vpx_get_filter_taps(filter_x) == 2 &&
+       vpx_get_filter_taps(filter_y) == 2) {
+     /* 2-tap in both directions: the helpers get each kernel from index 3 on. */
+     switch (w) {
+       case 4:
+         common_hv_2ht_2vt_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst,
+                                               (int32_t)dst_stride, &filt_hor[3],
+                                               &filt_ver[3], h);
+         return;
+       case 8:
+         common_hv_2ht_2vt_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst,
+                                               (int32_t)dst_stride, &filt_hor[3],
+                                               &filt_ver[3], h);
+         return;
+       case 16:
+         common_hv_2ht_2vt_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst,
+                                                (int32_t)dst_stride,
+                                                &filt_hor[3], &filt_ver[3], h);
+         return;
+       case 32:
+         common_hv_2ht_2vt_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst,
+                                                (int32_t)dst_stride,
+                                                &filt_hor[3], &filt_ver[3], h);
+         return;
+       case 64:
+         common_hv_2ht_2vt_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst,
+                                                (int32_t)dst_stride,
+                                                &filt_hor[3], &filt_ver[3], h);
+         return;
+       default:
+         break;
+     }
+   } else if (!(vpx_get_filter_taps(filter_x) == 2 ||
+                vpx_get_filter_taps(filter_y) == 2)) {
+     /* Neither direction is 2-tap: use the 8-tap kernels on the full arrays. */
+     switch (w) {
+       case 4:
+         common_hv_8ht_8vt_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst,
+                                               (int32_t)dst_stride, filt_hor,
+                                               filt_ver, h);
+         return;
+       case 8:
+         common_hv_8ht_8vt_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst,
+                                               (int32_t)dst_stride, filt_hor,
+                                               filt_ver, h);
+         return;
+       case 16:
+         common_hv_8ht_8vt_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst,
+                                                (int32_t)dst_stride, filt_hor,
+                                                filt_ver, h);
+         return;
+       case 32:
+         common_hv_8ht_8vt_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst,
+                                                (int32_t)dst_stride, filt_hor,
+                                                filt_ver, h);
+         return;
+       case 64:
+         common_hv_8ht_8vt_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst,
+                                                (int32_t)dst_stride, filt_hor,
+                                                filt_ver, h);
+         return;
+       default:
+         break;
+     }
+   }
+   /* Mixed 2-tap/other kernels, or a width with no LSX kernel: C version. */
+   vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                       x_step_q4, y0_q4, y_step_q4, w, h);
+ }
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c
new file mode 100644
index 0000000000..5c6413df44
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c
@@ -0,0 +1,918 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/vpx_convolve_lsx.h"
+/*
+ * Vertical 8-tap filter for a 4-pixel-wide block. Each filtered row is
+ * rounding-averaged (__lsx_vavgr_bu) with the row already in dst and the
+ * result is written back to dst.
+ *
+ * src, src_stride: source pixels and row pitch; reading starts three rows
+ * above src.
+ * dst, dst_stride: destination pixels and row pitch (read, then written).
+ * filter: eight int8 taps, loaded as four replicated two-tap pairs.
+ * height: number of rows; four rows are produced per iteration
+ * (height >> 2 iterations), so a remainder of height % 4 is not produced.
+ */
+static void common_vt_8t_and_aver_dst_4w_lsx(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ const int8_t *filter,
+ int32_t height) {
+ uint32_t loop_cnt = (height >> 2); /* four output rows per iteration */
+ uint8_t *dst_tmp = dst; /* read cursor for dst; dst is the write cursor */
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+ __m128i reg0, reg1, reg2, reg3, reg4;
+ __m128i filter0, filter1, filter2, filter3;
+ __m128i out0, out1;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride + src_stride2;
+ int32_t src_stride4 = src_stride2 << 1;
+ uint8_t *src_tmp0 = (uint8_t *)src - src_stride3; /* three rows above src */
+
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filter0, filter1, filter2, filter3); /* tap pairs 0-1, 2-3, 4-5, 6-7 */
+ src0 = __lsx_vld(src_tmp0, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src1,
+ src2);
+ src3 = __lsx_vldx(src_tmp0, src_stride3);
+ src_tmp0 += src_stride4;
+ src4 = __lsx_vld(src_tmp0, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src5,
+ src6);
+ src_tmp0 += src_stride3; /* src0..src6 now hold the first seven rows */
+ DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, tmp0,
+ tmp1, tmp2, tmp3);
+ DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, tmp4, tmp5);
+ DUP2_ARG2(__lsx_vilvl_d, tmp3, tmp0, tmp4, tmp1, reg0, reg1);
+ reg2 = __lsx_vilvl_d(tmp5, tmp2); /* reg0..reg2: row pairs (0,1)|(1,2), (2,3)|(3,4), (4,5)|(5,6) */
+ DUP2_ARG2(__lsx_vxori_b, reg0, 128, reg1, 128, reg0, reg1);
+ reg2 = __lsx_vxori_b(reg2, 128); /* XOR 128 flips the top bit of each byte; NOTE(review): presumably re-biases pixels to signed for filt_8tap_dpadd_s_h - confirm */
+
+ for (; loop_cnt--;) {
+ src7 = __lsx_vld(src_tmp0, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src8,
+ src9);
+ src10 = __lsx_vldx(src_tmp0, src_stride3); /* four new source rows */
+ src_tmp0 += src_stride4;
+ src0 = __lsx_vldrepl_w(dst_tmp, 0); /* src0..src3 are reused for four dst rows */
+ dst_tmp += dst_stride;
+ src1 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ src2 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ src3 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ DUP2_ARG2(__lsx_vilvl_w, src1, src0, src3, src2, src0, src1);
+ src0 = __lsx_vilvl_d(src1, src0); /* four 4-byte dst rows packed in one vector */
+ DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9,
+ tmp0, tmp1, tmp2, tmp3);
+ DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, reg3, reg4);
+ DUP2_ARG2(__lsx_vxori_b, reg3, 128, reg4, 128, reg3, reg4);
+ out0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, reg3, filter0, filter1,
+ filter2, filter3); /* output rows 0 and 1 */
+ out1 = filt_8tap_dpadd_s_h(reg1, reg2, reg3, reg4, filter0, filter1,
+ filter2, filter3); /* output rows 2 and 3 */
+ out0 = __lsx_vssrarni_b_h(out1, out0, 7); /* rounding shift by 7, narrow to bytes */
+ out0 = __lsx_vxori_b(out0, 128); /* flip the top bit back */
+ out0 = __lsx_vavgr_bu(out0, src0); /* rounding average with the dst rows */
+ __lsx_vstelm_w(out0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_w(out0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_w(out0, dst, 0, 2);
+ dst += dst_stride;
+ __lsx_vstelm_w(out0, dst, 0, 3);
+ dst += dst_stride;
+ reg0 = reg2; /* slide the row window down by four rows */
+ reg1 = reg3;
+ reg2 = reg4;
+ src6 = src10;
+ }
+}
+/*
+ * Vertical 8-tap filter for an 8-pixel-wide block. Each filtered row is
+ * rounding-averaged (__lsx_vavgr_bu) with the row already in dst and the
+ * result is written back to dst.
+ *
+ * src, src_stride: source pixels and row pitch; reading starts three rows
+ * above src.
+ * dst, dst_stride: destination pixels and row pitch (read, then written).
+ * filter: eight int8 taps, loaded as four replicated two-tap pairs.
+ * height: number of rows; four rows are produced per iteration
+ * (height >> 2 iterations).
+ */
+static void common_vt_8t_and_aver_dst_8w_lsx(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ const int8_t *filter,
+ int32_t height) {
+ uint32_t loop_cnt = height >> 2; /* four output rows per iteration */
+ uint8_t *dst_tmp = dst; /* read cursor for dst; dst is the write cursor */
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+ __m128i reg0, reg1, reg2, reg3, reg4, reg5;
+ __m128i filter0, filter1, filter2, filter3;
+ __m128i out0, out1, out2, out3;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride + src_stride2;
+ int32_t src_stride4 = src_stride2 << 1;
+ uint8_t *src_tmp0 = (uint8_t *)src - src_stride3; /* three rows above src */
+
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filter0, filter1, filter2, filter3); /* tap pairs 0-1, 2-3, 4-5, 6-7 */
+
+ src0 = __lsx_vld(src_tmp0, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src1,
+ src2);
+ src3 = __lsx_vldx(src_tmp0, src_stride3);
+ src_tmp0 += src_stride4;
+ src4 = __lsx_vld(src_tmp0, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src5,
+ src6);
+ src_tmp0 += src_stride3; /* src0..src6 now hold the first seven rows */
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);
+ DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
+ src6 = __lsx_vxori_b(src6, 128); /* XOR 128 flips the top bit of each byte; NOTE(review): presumably re-biases pixels to signed for filt_8tap_dpadd_s_h - confirm */
+ DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, reg0,
+ reg1, reg2, reg3);
+ DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5); /* reg0..2: row pairs (0,1),(2,3),(4,5); reg3..5: (1,2),(3,4),(5,6) */
+
+ for (; loop_cnt--;) {
+ src7 = __lsx_vld(src_tmp0, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src8,
+ src9);
+ src10 = __lsx_vldx(src_tmp0, src_stride3); /* four new source rows */
+ src_tmp0 += src_stride4;
+ src0 = __lsx_vldrepl_d(dst_tmp, 0); /* src0..src3 are reused for four dst rows */
+ dst_tmp += dst_stride;
+ src1 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ src2 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ src3 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ DUP2_ARG2(__lsx_vilvl_d, src1, src0, src3, src2, src0, src1); /* two 8-byte dst rows per vector */
+ DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7,
+ src8, src9, src10);
+ DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9,
+ tmp0, tmp1, tmp2, tmp3); /* row pairs (6,7),(7,8),(8,9),(9,10) */
+ out0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, tmp0, filter0, filter1,
+ filter2, filter3); /* output row 0 */
+ out1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, tmp1, filter0, filter1,
+ filter2, filter3); /* output row 1 */
+ out2 = filt_8tap_dpadd_s_h(reg1, reg2, tmp0, tmp2, filter0, filter1,
+ filter2, filter3); /* output row 2 */
+ out3 = filt_8tap_dpadd_s_h(reg4, reg5, tmp1, tmp3, filter0, filter1,
+ filter2, filter3); /* output row 3 */
+ DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); /* rounding shift by 7, narrow to bytes */
+ DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); /* flip the top bit back */
+ DUP2_ARG2(__lsx_vavgr_bu, out0, src0, out1, src1, out0, out1); /* rounding average with the dst rows */
+ __lsx_vstelm_d(out0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(out0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_d(out1, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(out1, dst, 0, 1);
+ dst += dst_stride;
+ reg0 = reg2; /* slide the row window down by four rows */
+ reg1 = tmp0;
+ reg2 = tmp2;
+ reg3 = reg5;
+ reg4 = tmp1;
+ reg5 = tmp3;
+ src6 = src10;
+ }
+}
+/*
+ * Vertical 8-tap filter with dst averaging for blocks whose width is a
+ * multiple of 16. The block is processed as width >> 4 columns of 16 pixels;
+ * inside each column four rows are produced per iteration (height >> 2
+ * iterations). Every filtered row is rounding-averaged with the row already
+ * in dst and stored back.
+ *
+ * src, src_stride: source pixels and row pitch; reading starts three rows
+ * above src.
+ * dst, dst_stride: destination pixels and row pitch (read, then written).
+ * filter: eight int8 taps, loaded as four replicated two-tap pairs.
+ * height, width: block size in pixels.
+ */
+static void common_vt_8t_and_aver_dst_16w_mult_lsx(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height, int32_t width) {
+ uint8_t *src_tmp;
+ uint32_t cnt = width >> 4; /* number of 16-pixel columns */
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ __m128i filter0, filter1, filter2, filter3;
+ __m128i reg0, reg1, reg2, reg3, reg4, reg5;
+ __m128i reg6, reg7, reg8, reg9, reg10, reg11;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride + src_stride2;
+ int32_t src_stride4 = src_stride2 << 1;
+ int32_t dst_stride2 = dst_stride << 1;
+ int32_t dst_stride3 = dst_stride2 + dst_stride;
+ int32_t dst_stride4 = dst_stride2 << 1;
+ uint8_t *src_tmp0 = (uint8_t *)src - src_stride3; /* three rows above src */
+
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filter0, filter1, filter2, filter3); /* tap pairs 0-1, 2-3, 4-5, 6-7 */
+ for (; cnt--;) {
+ uint32_t loop_cnt = height >> 2; /* four output rows per iteration */
+ uint8_t *dst_reg = dst; /* row cursor inside the current column */
+
+ src_tmp = src_tmp0;
+ src0 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src1,
+ src2);
+ src3 = __lsx_vldx(src_tmp, src_stride3);
+ src_tmp += src_stride4;
+ src4 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5,
+ src6);
+ src_tmp += src_stride3; /* src0..src6 now hold the first seven rows */
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);
+ DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
+ src6 = __lsx_vxori_b(src6, 128); /* XOR 128 flips the top bit of each byte; NOTE(review): presumably re-biases pixels to signed for filt_8tap_dpadd_s_h - confirm */
+ DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
+ reg0, reg1, reg2, reg3);
+ DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5); /* reg0..reg5: low-half row pairs */
+ DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1,
+ reg6, reg7, reg8, reg9);
+ DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, reg10, reg11); /* reg6..reg11: high-half row pairs */
+ for (; loop_cnt--;) {
+ src7 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src8,
+ src9);
+ src10 = __lsx_vldx(src_tmp, src_stride3); /* four new source rows */
+ src_tmp += src_stride4;
+ DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128,
+ src7, src8, src9, src10);
+ DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9,
+ src0, src1, src2, src3); /* src0..src3 are reused for the new low-half row pairs */
+ DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10, src9,
+ src4, src5, src7, src8); /* src7/src8 are overwritten last, after their final use as inputs */
+ tmp0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, src0, filter0, filter1,
+ filter2, filter3);
+ tmp1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, src1, filter0, filter1,
+ filter2, filter3);
+ tmp2 = filt_8tap_dpadd_s_h(reg6, reg7, reg8, src4, filter0, filter1,
+ filter2, filter3);
+ tmp3 = filt_8tap_dpadd_s_h(reg9, reg10, reg11, src5, filter0, filter1,
+ filter2, filter3);
+ DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1); /* rounding shift by 7, narrow; joins low and high halves */
+ DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); /* flip the top bit back */
+ tmp2 = __lsx_vld(dst_reg, 0);
+ tmp3 = __lsx_vldx(dst_reg, dst_stride);
+ DUP2_ARG2(__lsx_vavgr_bu, tmp0, tmp2, tmp1, tmp3, tmp0, tmp1); /* rounding average with dst rows 0 and 1 */
+ __lsx_vst(tmp0, dst_reg, 0);
+ __lsx_vstx(tmp1, dst_reg, dst_stride);
+ tmp0 = filt_8tap_dpadd_s_h(reg1, reg2, src0, src2, filter0, filter1,
+ filter2, filter3);
+ tmp1 = filt_8tap_dpadd_s_h(reg4, reg5, src1, src3, filter0, filter1,
+ filter2, filter3);
+ tmp2 = filt_8tap_dpadd_s_h(reg7, reg8, src4, src7, filter0, filter1,
+ filter2, filter3);
+ tmp3 = filt_8tap_dpadd_s_h(reg10, reg11, src5, src8, filter0, filter1,
+ filter2, filter3);
+ DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
+ DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+ tmp2 = __lsx_vldx(dst_reg, dst_stride2);
+ tmp3 = __lsx_vldx(dst_reg, dst_stride3);
+ DUP2_ARG2(__lsx_vavgr_bu, tmp0, tmp2, tmp1, tmp3, tmp0, tmp1); /* rounding average with dst rows 2 and 3 */
+ __lsx_vstx(tmp0, dst_reg, dst_stride2);
+ __lsx_vstx(tmp1, dst_reg, dst_stride3);
+ dst_reg += dst_stride4;
+
+ reg0 = reg2; /* slide the row window down by four rows */
+ reg1 = src0;
+ reg2 = src2;
+ reg3 = reg5;
+ reg4 = src1;
+ reg5 = src3;
+ reg6 = reg8;
+ reg7 = src4;
+ reg8 = src7;
+ reg9 = reg11;
+ reg10 = src5;
+ reg11 = src8;
+ src6 = src10;
+ }
+ src_tmp0 += 16; /* next 16-pixel column */
+ dst += 16;
+ }
+}
+
+/*
+ * Vertical 8-tap filter with dst averaging for a block exactly 16 pixels
+ * wide: forwards to the multi-column kernel with a single 16-pixel column.
+ */
+static void common_vt_8t_and_aver_dst_16w_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              const int8_t *filter,
+                                              int32_t height) {
+  const int32_t block_width = 16;
+
+  common_vt_8t_and_aver_dst_16w_mult_lsx(src, src_stride, dst, dst_stride,
+                                         filter, height, block_width);
+}
+
+/*
+ * Vertical 8-tap filter with dst averaging for a block exactly 32 pixels
+ * wide: forwards to the multi-column kernel, which walks two 16-pixel columns.
+ */
+static void common_vt_8t_and_aver_dst_32w_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              const int8_t *filter,
+                                              int32_t height) {
+  const int32_t block_width = 32;
+
+  common_vt_8t_and_aver_dst_16w_mult_lsx(src, src_stride, dst, dst_stride,
+                                         filter, height, block_width);
+}
+
+/*
+ * Vertical 8-tap filter with dst averaging for a block exactly 64 pixels
+ * wide: forwards to the multi-column kernel, which walks four 16-pixel
+ * columns.
+ */
+static void common_vt_8t_and_aver_dst_64w_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              const int8_t *filter,
+                                              int32_t height) {
+  const int32_t block_width = 64;
+
+  common_vt_8t_and_aver_dst_16w_mult_lsx(src, src_stride, dst, dst_stride,
+                                         filter, height, block_width);
+}
+/*
+ * Vertical 2-tap filter for a 4x4 block. The four filtered rows are
+ * rounding-averaged (__lsx_vavgr_bu) with the rows already in dst and
+ * written back.
+ *
+ * src, src_stride: source pixels and row pitch; five rows are read starting
+ * at src.
+ * dst, dst_stride: destination pixels and row pitch (read, then written).
+ * filter: points at the two int8 taps, read as one replicated 16-bit pair.
+ */
+static void common_vt_2t_and_aver_dst_4x4_lsx(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter) {
+ __m128i src0, src1, src2, src3, src4;
+ __m128i dst0, dst1, dst2, dst3, out, filt0, src2110, src4332;
+ __m128i src10_r, src32_r, src21_r, src43_r;
+ __m128i tmp0, tmp1;
+ uint8_t *dst_tmp = dst; /* read cursor for dst; dst is the write cursor */
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+
+ filt0 = __lsx_vldrepl_h(filter, 0); /* both taps, replicated across the vector */
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+ src3 = __lsx_vldx(src, src_stride3);
+ src += src_stride4;
+ src4 = __lsx_vld(src, 0);
+ src += src_stride; /* src is not read again after this point */
+
+ dst0 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst1 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst2 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst3 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst0 = __lsx_vilvl_w(dst1, dst0);
+ dst1 = __lsx_vilvl_w(dst3, dst2);
+ dst0 = __lsx_vilvl_d(dst1, dst0); /* four 4-byte dst rows packed in one vector */
+ DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3,
+ src10_r, src21_r, src32_r, src43_r); /* byte-interleave each pair of adjacent rows */
+ DUP2_ARG2(__lsx_vilvl_d, src21_r, src10_r, src43_r, src32_r, src2110,
+ src4332); /* two output rows' worth of pairs per vector */
+ DUP2_ARG2(__lsx_vdp2_h_bu, src2110, filt0, src4332, filt0, tmp0, tmp1); /* two-tap dot product per pixel */
+ tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); /* rounding shift by FILTER_BITS, narrow to bytes */
+ out = __lsx_vavgr_bu(tmp0, dst0); /* rounding average with the dst rows */
+ __lsx_vstelm_w(out, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_w(out, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_w(out, dst, 0, 2);
+ dst += dst_stride;
+ __lsx_vstelm_w(out, dst, 0, 3);
+ dst += dst_stride; /* dst is not used again after this point */
+}
+/*
+ * Vertical 2-tap filter for a 4x8 block. The eight filtered rows are
+ * rounding-averaged (__lsx_vavgr_bu) with the rows already in dst and
+ * written back.
+ *
+ * src, src_stride: source pixels and row pitch; nine rows are read starting
+ * at src.
+ * dst, dst_stride: destination pixels and row pitch (read, then written).
+ * filter: points at the two int8 taps, read as one replicated 16-bit pair.
+ */
+static void common_vt_2t_and_aver_dst_4x8_lsx(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter) {
+ __m128i dst0, dst1, dst2, dst3, dst4;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src87_r;
+ __m128i src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+ __m128i src2110, src4332, src6554, src8776, filt0;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+ uint8_t *dst_tmp = dst; /* read cursor for dst; dst is the write cursor */
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+
+ filt0 = __lsx_vldrepl_h(filter, 0); /* both taps, replicated across the vector */
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+ src3 = __lsx_vldx(src, src_stride3);
+ src += src_stride4;
+ src4 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6);
+ src7 = __lsx_vldx(src, src_stride3);
+ src += src_stride4;
+ src8 = __lsx_vld(src, 0); /* ninth and last source row */
+
+ dst0 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst1 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst2 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst3 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst0 = __lsx_vilvl_w(dst1, dst0);
+ dst1 = __lsx_vilvl_w(dst3, dst2);
+ dst0 = __lsx_vilvl_d(dst1, dst0); /* dst rows 0..3 packed in one vector */
+
+ dst1 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst2 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst3 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst4 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst1 = __lsx_vilvl_w(dst2, dst1);
+ dst2 = __lsx_vilvl_w(dst4, dst3);
+ dst1 = __lsx_vilvl_d(dst2, dst1); /* dst rows 4..7 packed in one vector */
+
+ DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3,
+ src10_r, src21_r, src32_r, src43_r); /* byte-interleave each pair of adjacent rows */
+ DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7,
+ src54_r, src65_r, src76_r, src87_r);
+ DUP4_ARG2(__lsx_vilvl_d, src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
+ src87_r, src76_r, src2110, src4332, src6554, src8776); /* two output rows' worth of pairs per vector */
+ DUP4_ARG2(__lsx_vdp2_h_bu, src2110, filt0, src4332, filt0, src6554, filt0,
+ src8776, filt0, tmp0, tmp1, tmp2, tmp3); /* two-tap dot product per pixel */
+ DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+ FILTER_BITS, tmp0, tmp2); /* rounding shift by FILTER_BITS, narrow to bytes */
+ DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp2, dst1, tmp0, tmp2); /* rounding average with the dst rows */
+ __lsx_vstelm_w(tmp0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_w(tmp0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_w(tmp0, dst, 0, 2);
+ dst += dst_stride;
+ __lsx_vstelm_w(tmp0, dst, 0, 3);
+ dst += dst_stride;
+
+ __lsx_vstelm_w(tmp2, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_w(tmp2, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_w(tmp2, dst, 0, 2);
+ dst += dst_stride;
+ __lsx_vstelm_w(tmp2, dst, 0, 3);
+}
+
+/*
+ * Height dispatcher for the 4-pixel-wide vertical 2-tap filter with dst
+ * averaging. Heights 4 and 8 have dedicated kernels; for any other height
+ * nothing is read or written.
+ */
+static void common_vt_2t_and_aver_dst_4w_lsx(const uint8_t *src,
+                                             int32_t src_stride, uint8_t *dst,
+                                             int32_t dst_stride, int8_t *filter,
+                                             int32_t height) {
+  switch (height) {
+    case 4:
+      common_vt_2t_and_aver_dst_4x4_lsx(src, src_stride, dst, dst_stride,
+                                        filter);
+      break;
+    case 8:
+      common_vt_2t_and_aver_dst_4x8_lsx(src, src_stride, dst, dst_stride,
+                                        filter);
+      break;
+    default:
+      /* No kernel for this height: dst is left untouched. */
+      break;
+  }
+}
+/*
+ * Vertical 2-tap filter for an 8x4 block. The four filtered rows are
+ * rounding-averaged (__lsx_vavgr_bu) with the rows already in dst and
+ * written back.
+ *
+ * src, src_stride: source pixels and row pitch; five rows are read starting
+ * at src.
+ * dst, dst_stride: destination pixels and row pitch (read, then written).
+ * filter: points at the two int8 taps, read as one replicated 16-bit pair.
+ */
+static void common_vt_2t_and_aver_dst_8x4_lsx(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter) {
+ __m128i src0, src1, src2, src3, src4;
+ __m128i dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3, filt0;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+ uint8_t *dst_tmp = dst; /* read cursor for dst; dst is the write cursor */
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+
+ filt0 = __lsx_vldrepl_h(filter, 0); /* both taps, replicated across the vector */
+ src0 = __lsx_vld(src, 0);
+ DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+ src, src_stride4, src1, src2, src3, src4); /* rows 1..4 */
+ dst0 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst1 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst2 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst3 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride; /* dst_tmp is not read again after this point */
+ DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); /* two 8-byte dst rows per vector */
+ DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec1); /* byte-interleave each pair of adjacent rows */
+ DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec2, vec3);
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
+ tmp0, tmp1, tmp2, tmp3); /* two-tap dot product per pixel */
+ DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+ FILTER_BITS, tmp0, tmp2); /* rounding shift by FILTER_BITS, narrow to bytes */
+ DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp2, dst1, tmp0, tmp2); /* rounding average with the dst rows */
+ __lsx_vstelm_d(tmp0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(tmp0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_d(tmp2, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(tmp2, dst, 0, 1);
+}
+/*
+ * Vertical 2-tap filter for an 8-pixel-wide block. Eight rows are produced
+ * per iteration (height >> 3 iterations); every filtered row is
+ * rounding-averaged (__lsx_vavgr_bu) with the row already in dst and written
+ * back.
+ *
+ * src, src_stride: source pixels and row pitch; reading starts at src.
+ * dst, dst_stride: destination pixels and row pitch (read, then written).
+ * filter: points at the two int8 taps, read as one replicated 16-bit pair.
+ * height: number of rows; a remainder of height % 8 is not produced.
+ */
+static void common_vt_2t_and_aver_dst_8x8mult_lsx(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt = (height >> 3); /* eight output rows per iteration */
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+ uint8_t *dst_tmp = dst; /* read cursor for dst; dst is the write cursor */
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+
+ filt0 = __lsx_vldrepl_h(filter, 0); /* both taps, replicated across the vector */
+ src0 = __lsx_vld(src, 0); /* first row; later carried over from the previous iteration */
+ src += src_stride;
+
+ for (; loop_cnt--;) {
+ src1 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
+ src4 = __lsx_vldx(src, src_stride3);
+ src += src_stride4;
+ src5 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src6, src7);
+ src8 = __lsx_vldx(src, src_stride3);
+ src += src_stride4;
+
+ dst0 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst1 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst2 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst3 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); /* dst rows 0..3, two per vector */
+
+ dst2 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst3 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst4 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst5 = __lsx_vldrepl_d(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ DUP2_ARG2(__lsx_vilvl_d, dst3, dst2, dst5, dst4, dst2, dst3); /* dst rows 4..7, two per vector */
+
+ DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3,
+ vec0, vec1, vec2, vec3); /* byte-interleave each pair of adjacent rows */
+ DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7,
+ vec4, vec5, vec6, vec7);
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+ filt0, tmp0, tmp1, tmp2, tmp3); /* two-tap dot product, rows 0..3 */
+
+ DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+ FILTER_BITS, tmp0, tmp2); /* rounding shift by FILTER_BITS, narrow to bytes */
+ DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp2, dst1, tmp0, tmp2); /* rounding average with the dst rows */
+ __lsx_vstelm_d(tmp0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(tmp0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_d(tmp2, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(tmp2, dst, 0, 1);
+ dst += dst_stride;
+
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
+ filt0, tmp0, tmp1, tmp2, tmp3); /* two-tap dot product, rows 4..7 */
+ DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+ FILTER_BITS, tmp0, tmp2);
+ DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst2, tmp2, dst3, tmp0, tmp2);
+ __lsx_vstelm_d(tmp0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(tmp0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_d(tmp2, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(tmp2, dst, 0, 1);
+ dst += dst_stride;
+
+ src0 = src8; /* last row read becomes the first row of the next iteration */
+ }
+}
+
+/*
+ * Height dispatcher for the 8-pixel-wide vertical 2-tap filter with dst
+ * averaging: height 4 has a dedicated kernel, every other height goes to
+ * the eight-rows-per-iteration kernel.
+ */
+static void common_vt_2t_and_aver_dst_8w_lsx(const uint8_t *src,
+                                             int32_t src_stride, uint8_t *dst,
+                                             int32_t dst_stride, int8_t *filter,
+                                             int32_t height) {
+  if (height != 4) {
+    common_vt_2t_and_aver_dst_8x8mult_lsx(src, src_stride, dst, dst_stride,
+                                          filter, height);
+    return;
+  }
+
+  common_vt_2t_and_aver_dst_8x4_lsx(src, src_stride, dst, dst_stride, filter);
+}
+/*
+ * Vertical 2-tap filter for a 16-pixel-wide block. Four rows are produced
+ * per iteration (height >> 2 iterations); every filtered row is
+ * rounding-averaged (__lsx_vavgr_bu) with the row already in dst and written
+ * back.
+ *
+ * src, src_stride: source pixels and row pitch; reading starts at src.
+ * dst, dst_stride: destination pixels and row pitch (read, then written).
+ * filter: points at the two int8 taps, read as one replicated 16-bit pair.
+ * height: number of rows; a remainder of height % 4 is not produced.
+ */
+static void common_vt_2t_and_aver_dst_16w_lsx(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt = (height >> 2); /* four output rows per iteration */
+ __m128i src0, src1, src2, src3, src4, dst0, dst1, dst2, dst3, filt0;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i tmp0, tmp1;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+ int32_t dst_stride2 = dst_stride << 1;
+ int32_t dst_stride3 = dst_stride2 + dst_stride;
+
+ filt0 = __lsx_vldrepl_h(filter, 0); /* both taps, replicated across the vector */
+ src0 = __lsx_vld(src, 0); /* first row; later carried over from the previous iteration */
+ src += src_stride;
+
+ for (; loop_cnt--;) {
+ src1 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
+ src4 = __lsx_vldx(src, src_stride3);
+ src += src_stride4;
+
+ dst0 = __lsx_vld(dst, 0); /* all four dst rows are read before any is written */
+ DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2);
+ dst3 = __lsx_vldx(dst, dst_stride3);
+
+ DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2); /* low halves of row pairs (0,1) and (1,2) */
+ DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3); /* high halves of the same pairs */
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+ tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); /* rounding shift by FILTER_BITS, narrow to bytes */
+ tmp0 = __lsx_vavgr_bu(tmp0, dst0); /* rounding average with dst row 0 */
+ __lsx_vst(tmp0, dst, 0);
+ dst += dst_stride;
+
+ DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6); /* low halves of row pairs (2,3) and (3,4) */
+ DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7); /* high halves of the same pairs */
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+ tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ tmp0 = __lsx_vavgr_bu(tmp0, dst1); /* output row 1 */
+ __lsx_vst(tmp0, dst, 0);
+ dst += dst_stride;
+
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+ tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ tmp0 = __lsx_vavgr_bu(tmp0, dst2); /* output row 2 */
+ __lsx_vst(tmp0, dst, 0);
+ dst += dst_stride;
+
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+ tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ tmp0 = __lsx_vavgr_bu(tmp0, dst3); /* output row 3 */
+ __lsx_vst(tmp0, dst, 0);
+ dst += dst_stride;
+
+ src0 = src4; /* last row read becomes the first row of the next iteration */
+ }
+}
+
+/*
+ * Vertical 2-tap filter for a 32-pixel-wide block, handled as two 16-byte
+ * columns. Four rows are produced per iteration (height >> 2 iterations);
+ * every filtered row is rounding-averaged (__lsx_vavgr_bu) with the row
+ * already in dst and written back.
+ *
+ * src, src_stride: source pixels and row pitch; reading starts at src.
+ * dst, dst_stride: destination pixels and row pitch (read, then written).
+ * filter: points at the two int8 taps, read as one replicated 16-bit pair.
+ * height: number of rows; a remainder of height % 4 is not produced.
+ */
+static void common_vt_2t_and_aver_dst_32w_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = (height >> 2);
+  /* Read-only alias of src for the right-hand column, hence const. */
+  const uint8_t *src_tmp1;
+  uint8_t *dst_tmp1;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
+  __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+  __m128i tmp0, tmp1;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+  /* First row of both columns; later carried over between iterations. */
+  DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src5);
+  src += src_stride;
+
+  for (; loop_cnt--;) {
+    /* Left column: four new source rows and the four dst rows. */
+    src1 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
+    src4 = __lsx_vldx(src, src_stride3);
+
+    dst0 = __lsx_vld(dst, 0);
+    DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2);
+    dst3 = __lsx_vldx(dst, dst_stride3);
+
+    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2);
+    DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3);
+
+    /* Right column: same rows, 16 bytes further along. */
+    src_tmp1 = src + 16;
+    src6 = __lsx_vld(src_tmp1, 0);
+    DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src7,
+              src8);
+    src9 = __lsx_vldx(src_tmp1, src_stride3);
+
+    dst_tmp1 = dst + 16;
+    dst4 = __lsx_vld(dst_tmp1, 0);
+    DUP2_ARG2(__lsx_vldx, dst_tmp1, dst_stride, dst_tmp1, dst_stride2, dst5,
+              dst6);
+    dst7 = __lsx_vldx(dst_tmp1, dst_stride3);
+    src += src_stride4;
+
+    /* Left column, output rows 0..3 (dst is not advanced yet). */
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst0);
+    __lsx_vst(tmp0, dst, 0);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst1);
+    __lsx_vstx(tmp0, dst, dst_stride);
+
+    DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6);
+    DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst2);
+    __lsx_vstx(tmp0, dst, dst_stride2);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst3);
+    __lsx_vstx(tmp0, dst, dst_stride3);
+
+    /* Right column, output rows 0..3; dst advances one row per store. */
+    DUP2_ARG2(__lsx_vilvl_b, src6, src5, src7, src6, vec0, vec2);
+    DUP2_ARG2(__lsx_vilvh_b, src6, src5, src7, src6, vec1, vec3);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst4);
+    __lsx_vst(tmp0, dst, 16);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst5);
+    dst += dst_stride;
+    __lsx_vst(tmp0, dst, 16);
+
+    DUP2_ARG2(__lsx_vilvl_b, src8, src7, src9, src8, vec4, vec6);
+    DUP2_ARG2(__lsx_vilvh_b, src8, src7, src9, src8, vec5, vec7);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst6);
+    dst += dst_stride;
+    __lsx_vst(tmp0, dst, 16);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst7);
+    dst += dst_stride;
+    __lsx_vst(tmp0, dst, 16);
+    dst += dst_stride;
+
+    /* The last row read seeds the next iteration for both columns. */
+    src0 = src4;
+    src5 = src9;
+  }
+}
+/*
+ * Vertical 2-tap filter for a 64-pixel-wide block, handled as four 16-byte
+ * columns. Two rows are produced per iteration (height >> 1 iterations);
+ * every filtered row is rounding-averaged (__lsx_vavgr_bu) with the row
+ * already in dst and written back.
+ *
+ * src, src_stride: source pixels and row pitch; reading starts at src.
+ * dst, dst_stride: destination pixels and row pitch (read, then written).
+ * filter: points at the two int8 taps, read as one replicated 16-bit pair.
+ * height: number of rows; an odd last row is not produced.
+ */
+static void common_vt_2t_and_aver_dst_64w_lsx(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt = (height >> 1); /* two output rows per iteration */
+ int32_t src_stride2 = src_stride << 1;
+ int32_t dst_stride2 = dst_stride << 1;
+ uint8_t *src_tmp1;
+ uint8_t *dst_tmp1;
+ __m128i src0, src1, src2, src3, src4, src5;
+ __m128i src6, src7, src8, src9, src10, src11, filt0;
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i tmp0, tmp1;
+
+ filt0 = __lsx_vldrepl_h(filter, 0); /* both taps, replicated across the vector */
+ DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src3, src6,
+ src9); /* first row of the four columns; later carried over between iterations */
+ src += src_stride;
+
+ for (; loop_cnt--;) {
+ src2 = __lsx_vldx(src, src_stride); /* second new row, column 0 */
+ dst1 = __lsx_vldx(dst, dst_stride);
+ DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src1, src4, src7,
+ src10); /* first new row, columns 0..3 */
+ DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, dst0, dst2, dst4,
+ dst6); /* dst row 0, columns 0..3 */
+ src_tmp1 = (uint8_t *)src + 16; /* cast drops const to fit src_tmp1's type; it is only read through */
+ src5 = __lsx_vldx(src_tmp1, src_stride);
+ src_tmp1 = src_tmp1 + 16;
+ src8 = __lsx_vldx(src_tmp1, src_stride);
+ src_tmp1 = src_tmp1 + 16;
+ src11 = __lsx_vldx(src_tmp1, src_stride); /* second new row, columns 1..3 now loaded */
+
+ dst_tmp1 = dst + 16;
+ dst3 = __lsx_vldx(dst_tmp1, dst_stride);
+ dst_tmp1 = dst + 32;
+ dst5 = __lsx_vldx(dst_tmp1, dst_stride);
+ dst_tmp1 = dst + 48;
+ dst7 = __lsx_vldx(dst_tmp1, dst_stride); /* dst row 1, columns 1..3 now loaded */
+ src += src_stride2;
+
+ DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2);
+ DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+ tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); /* rounding shift by FILTER_BITS, narrow to bytes */
+ tmp0 = __lsx_vavgr_bu(tmp0, dst0);
+ __lsx_vst(tmp0, dst, 0); /* column 0, row 0 */
+
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+ tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ tmp0 = __lsx_vavgr_bu(tmp0, dst1);
+ __lsx_vstx(tmp0, dst, dst_stride); /* column 0, row 1 */
+
+ DUP2_ARG2(__lsx_vilvl_b, src4, src3, src5, src4, vec4, vec6);
+ DUP2_ARG2(__lsx_vilvh_b, src4, src3, src5, src4, vec5, vec7);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+ tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ tmp0 = __lsx_vavgr_bu(tmp0, dst2);
+ __lsx_vst(tmp0, dst, 16); /* column 1, row 0 */
+
+ dst_tmp1 = dst + 16;
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+ tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ tmp0 = __lsx_vavgr_bu(tmp0, dst3);
+ __lsx_vstx(tmp0, dst_tmp1, dst_stride); /* column 1, row 1 */
+
+ DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, vec0, vec2);
+ DUP2_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, vec1, vec3);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+ tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ tmp0 = __lsx_vavgr_bu(tmp0, dst4);
+ __lsx_vst(tmp0, dst, 32); /* column 2, row 0 */
+
+ dst_tmp1 = dst_tmp1 + 16;
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+ tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ tmp0 = __lsx_vavgr_bu(tmp0, dst5);
+ __lsx_vstx(tmp0, dst_tmp1, dst_stride); /* column 2, row 1 */
+
+ DUP2_ARG2(__lsx_vilvl_b, src10, src9, src11, src10, vec4, vec6);
+ DUP2_ARG2(__lsx_vilvh_b, src10, src9, src11, src10, vec5, vec7);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+ tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ tmp0 = __lsx_vavgr_bu(tmp0, dst6);
+ __lsx_vst(tmp0, dst, 48); /* column 3, row 0 */
+
+ dst_tmp1 = dst_tmp1 + 16;
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+ tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ tmp0 = __lsx_vavgr_bu(tmp0, dst7);
+ __lsx_vstx(tmp0, dst_tmp1, dst_stride); /* column 3, row 1 */
+ dst += dst_stride2;
+
+ src0 = src2; /* the second new row seeds the next iteration, per column */
+ src3 = src5;
+ src6 = src8;
+ src9 = src11;
+ }
+}
+
+/*
+ * LSX entry point for the vertical convolve with destination averaging.
+ *
+ * The kernel row filter[y0_q4] is narrowed to eight int8 taps. When
+ * vpx_get_filter_taps() reports a 2-tap kernel, the 2-tap routines are used
+ * and handed the taps starting at index 3; otherwise the 8-tap routines get
+ * all eight taps. Widths other than 4, 8, 16, 32 and 64 are forwarded to
+ * vpx_convolve8_avg_vert_c(). Strides are narrowed to int32_t for the LSX
+ * routines.
+ *
+ * Debug builds assert that y_step_q4 is 16 and that the second 32-bit word
+ * of the kernel row is not 0x800000.
+ */
+void vpx_convolve8_avg_vert_lsx(const uint8_t *src, ptrdiff_t src_stride,
+                                uint8_t *dst, ptrdiff_t dst_stride,
+                                const InterpKernel *filter, int x0_q4,
+                                int x_step_q4, int y0_q4, int y_step_q4, int w,
+                                int h) {
+  const int16_t *const filter_y = filter[y0_q4];
+  const int32_t s_stride = (int32_t)src_stride;
+  const int32_t d_stride = (int32_t)dst_stride;
+  int8_t filt_ver[8];
+  int8_t *const filt_2tap = &filt_ver[3];
+  int tap;
+  int two_tap;
+
+  assert(y_step_q4 == 16);
+  assert(((const int32_t *)filter_y)[1] != 0x800000);
+
+  for (tap = 0; tap < 8; ++tap) {
+    filt_ver[tap] = filter_y[tap];
+  }
+
+  two_tap = (vpx_get_filter_taps(filter_y) == 2);
+
+  switch (w) {
+    case 4:
+      if (two_tap) {
+        common_vt_2t_and_aver_dst_4w_lsx(src, s_stride, dst, d_stride,
+                                         filt_2tap, h);
+      } else {
+        common_vt_8t_and_aver_dst_4w_lsx(src, s_stride, dst, d_stride,
+                                         filt_ver, h);
+      }
+      break;
+    case 8:
+      if (two_tap) {
+        common_vt_2t_and_aver_dst_8w_lsx(src, s_stride, dst, d_stride,
+                                         filt_2tap, h);
+      } else {
+        common_vt_8t_and_aver_dst_8w_lsx(src, s_stride, dst, d_stride,
+                                         filt_ver, h);
+      }
+      break;
+    case 16:
+      if (two_tap) {
+        common_vt_2t_and_aver_dst_16w_lsx(src, s_stride, dst, d_stride,
+                                          filt_2tap, h);
+      } else {
+        common_vt_8t_and_aver_dst_16w_lsx(src, s_stride, dst, d_stride,
+                                          filt_ver, h);
+      }
+      break;
+    case 32:
+      if (two_tap) {
+        common_vt_2t_and_aver_dst_32w_lsx(src, s_stride, dst, d_stride,
+                                          filt_2tap, h);
+      } else {
+        common_vt_8t_and_aver_dst_32w_lsx(src, s_stride, dst, d_stride,
+                                          filt_ver, h);
+      }
+      break;
+    case 64:
+      if (two_tap) {
+        common_vt_2t_and_aver_dst_64w_lsx(src, s_stride, dst, d_stride,
+                                          filt_2tap, h);
+      } else {
+        common_vt_8t_and_aver_dst_64w_lsx(src, s_stride, dst, d_stride,
+                                          filt_ver, h);
+      }
+      break;
+    default:
+      /* No LSX kernel for this width: use the C implementation. */
+      vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter,
+                               x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
+      break;
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c
new file mode 100644
index 0000000000..2c6459a978
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c
@@ -0,0 +1,814 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/vpx_convolve_lsx.h"
+
/* Byte-shuffle index table, laid out as three rows of 16 bytes.
 * Within this file the kernels load row 0 (byte offset 0) on the 8-wide and
 * wider paths and row 1 (byte offset 16) on the 4-wide paths; no function in
 * this file loads row 2.
 * Every row is a run of overlapping index pairs (i, i + 1).
 * NOTE(review): indices >= 16 presumably select bytes from the other source
 * operand of the shuffle -- confirm against the LSX vshuf.b definition. */
+ static const uint8_t mc_filt_mask_arr[16 * 3] = {
+ /* 8 width cases */
+ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+ /* 4 width cases */
+ 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+ /* 4 width cases */
+ 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+ };
+
/* 8-tap horizontal filter for a block 4 pixels wide and 4 rows high.
 *
 * src / src_stride: source pixels and row pitch in bytes.
 * dst / dst_stride: destination pixels and row pitch in bytes.
 * filter: eight int8 taps, consumed as four adjacent pairs.
 *
 * Writes 4 bytes to each of four consecutive dst rows. */
+ static void common_hz_8t_4x4_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter) {
+ __m128i src0, src1, src2, src3;
+ __m128i filter0, filter1, filter2, filter3;
+ __m128i mask0, mask1, mask2, mask3;
+ __m128i out, out0, out1;
+
/* Row 1 of the table (byte offset 16) is the 4-wide shuffle pattern. */
+ mask0 = __lsx_vld(mc_filt_mask_arr, 16);
/* Start reading three pixels to the left of the output position. */
+ src -= 3;
/* Each filterN holds one 16-bit tap pair (taps 2N, 2N + 1) replicated. */
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filter0, filter1, filter2, filter3);
/* mask1..mask3 are mask0 with 2, 4 and 6 added to every index byte. */
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 6);
+
/* NOTE(review): LSX_LD_4 is defined outside this file; it presumably loads
 * four consecutive rows -- confirm whether it also advances src. */
+ LSX_LD_4(src, src_stride, src0, src1, src2, src3);
/* Flip the top bit of every pixel byte; undone after filtering below. */
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);
+ HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+ filter0, filter1, filter2, filter3, out0, out1);
/* Narrow the 16-bit sums to bytes with a shift right by 7. */
+ out = __lsx_vssrarni_b_h(out1, out0, 7);
+ out = __lsx_vxori_b(out, 128);
/* One 32-bit element of out per destination row. */
+ __lsx_vstelm_w(out, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_w(out, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_w(out, dst, 0, 2);
+ dst += dst_stride;
+ __lsx_vstelm_w(out, dst, 0, 3);
+ }
+
/* 8-tap horizontal filter for a block 4 pixels wide and 8 rows high.
 *
 * Processes two groups of four source rows and writes 4 bytes to each of
 * eight consecutive dst rows.
 *
 * filter: eight int8 taps, consumed as four adjacent pairs. */
+ static void common_hz_8t_4x8_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter) {
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride + src_stride2;
+ int32_t src_stride4 = src_stride2 << 1;
+ __m128i src0, src1, src2, src3;
+ __m128i filter0, filter1, filter2, filter3;
+ __m128i mask0, mask1, mask2, mask3;
+ __m128i out0, out1, out2, out3;
/* Non-const alias of src, moved three pixels left; it is only read from. */
+ uint8_t *_src = (uint8_t *)src - 3;
+
/* Row 1 of the table (byte offset 16) is the 4-wide shuffle pattern. */
+ mask0 = __lsx_vld(mc_filt_mask_arr, 16);
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 6);
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filter0, filter1, filter2, filter3);
+
/* Rows 0..3. */
+ src0 = __lsx_vld(_src, 0);
+ DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
+ src3 = __lsx_vldx(_src, src_stride3);
+ _src += src_stride4;
/* Flip the top bit of every pixel byte; undone after filtering below. */
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);
+ HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+ filter0, filter1, filter2, filter3, out0, out1);
/* Rows 4..7. */
+ src0 = __lsx_vld(_src, 0);
+ DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
+ src3 = __lsx_vldx(_src, src_stride3);
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);
+ HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+ filter0, filter1, filter2, filter3, out2, out3);
/* Narrow both result pairs to bytes (shift right by 7): out0 then holds
 * rows 0..3 and out1 rows 4..7. */
+ DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+ DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+
+ __lsx_vstelm_w(out0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_w(out0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_w(out0, dst, 0, 2);
+ dst += dst_stride;
+ __lsx_vstelm_w(out0, dst, 0, 3);
+ dst += dst_stride;
+ __lsx_vstelm_w(out1, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_w(out1, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_w(out1, dst, 0, 2);
+ dst += dst_stride;
+ __lsx_vstelm_w(out1, dst, 0, 3);
+ }
+
/* Height dispatcher for the 4-wide, 8-tap horizontal filter.
 * Heights 4 and 8 go to their dedicated kernels; any other height leaves
 * dst untouched. */
+ static void common_hz_8t_4w_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height) {
+   switch (height) {
+     case 4:
+       common_hz_8t_4x4_lsx(src, src_stride, dst, dst_stride, filter);
+       break;
+     case 8:
+       common_hz_8t_4x8_lsx(src, src_stride, dst, dst_stride, filter);
+       break;
+     default:
+       /* No kernel for other heights; nothing is written. */
+       break;
+   }
+ }
+
/* 8-tap horizontal filter for a block 8 pixels wide and 4 rows high.
 *
 * Writes 8 bytes to each of four consecutive dst rows.
 *
 * filter: eight int8 taps, consumed as four adjacent pairs. */
+ static void common_hz_8t_8x4_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter) {
+ __m128i src0, src1, src2, src3;
+ __m128i filter0, filter1, filter2, filter3;
+ __m128i mask0, mask1, mask2, mask3;
+ __m128i out0, out1, out2, out3;
+
/* Row 0 of the table (byte offset 0) is the 8-wide shuffle pattern. */
+ mask0 = __lsx_vld(mc_filt_mask_arr, 0);
/* Start reading three pixels to the left of the output position. */
+ src -= 3;
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 6);
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filter0, filter1, filter2, filter3);
+
/* NOTE(review): LSX_LD_4 is defined outside this file; it presumably loads
 * four consecutive rows -- confirm. */
+ LSX_LD_4(src, src_stride, src0, src1, src2, src3);
/* Flip the top bit of every pixel byte; undone after filtering below. */
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+ filter0, filter1, filter2, filter3, out0, out1,
+ out2, out3);
/* Narrow to bytes (shift right by 7); each result vector holds two rows. */
+ DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+ DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
/* One 64-bit element per destination row. */
+ __lsx_vstelm_d(out0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(out0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_d(out1, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(out1, dst, 0, 1);
+ }
+
/* 8-tap horizontal filter for a block 8 pixels wide.
 *
 * Each loop iteration filters four rows, and the loop runs height >> 2
 * times, so any remainder of height modulo 4 is not processed.
 *
 * filter: eight int8 taps, consumed as four adjacent pairs. */
+ static void common_hz_8t_8x8mult_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height) {
+ uint32_t loop_cnt = height >> 2;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride + src_stride2;
+ int32_t src_stride4 = src_stride2 << 1;
+ __m128i src0, src1, src2, src3;
+ __m128i filter0, filter1, filter2, filter3;
+ __m128i mask0, mask1, mask2, mask3;
+ __m128i out0, out1, out2, out3;
/* Non-const alias of src, moved three pixels left; it is only read from. */
+ uint8_t *_src = (uint8_t *)src - 3;
+
/* Row 0 of the table (byte offset 0) is the 8-wide shuffle pattern. */
+ mask0 = __lsx_vld(mc_filt_mask_arr, 0);
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 6);
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filter0, filter1, filter2, filter3);
+
+ for (; loop_cnt--;) {
/* Load four rows, then step the source pointer down by four rows. */
+ src0 = __lsx_vld(_src, 0);
+ DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
+ src3 = __lsx_vldx(_src, src_stride3);
+ _src += src_stride4;
/* Flip the top bit of every pixel byte; undone after filtering below. */
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ mask3, filter0, filter1, filter2, filter3, out0,
+ out1, out2, out3);
/* Narrow to bytes (shift right by 7); each result vector holds two rows. */
+ DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+ DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+ __lsx_vstelm_d(out0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(out0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_d(out1, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(out1, dst, 0, 1);
+ dst += dst_stride;
+ }
+ }
+
/* Height dispatcher for the 8-wide, 8-tap horizontal filter.
 * Height 4 uses the single-pass 8x4 kernel; every other height goes to the
 * looping kernel that works in groups of four rows. */
+ static void common_hz_8t_8w_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height) {
+   if (height != 4) {
+     common_hz_8t_8x8mult_lsx(src, src_stride, dst, dst_stride, filter, height);
+     return;
+   }
+   common_hz_8t_8x4_lsx(src, src_stride, dst, dst_stride, filter);
+ }
+
/* 8-tap horizontal filter for a block 16 pixels wide.
 *
 * Each loop iteration filters two rows, and the loop runs height >> 1
 * times, so an odd trailing row is not processed.
 *
 * filter: eight int8 taps, consumed as four adjacent pairs. */
+ static void common_hz_8t_16w_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height) {
+ uint32_t loop_cnt = height >> 1;
/* Byte distance covered per iteration (two source rows). */
+ int32_t stride = src_stride << 1;
+ __m128i src0, src1, src2, src3;
+ __m128i filter0, filter1, filter2, filter3;
+ __m128i mask0, mask1, mask2, mask3;
+ __m128i out0, out1, out2, out3;
+
/* Row 0 of the table (byte offset 0) is the 8-wide shuffle pattern. */
+ mask0 = __lsx_vld(mc_filt_mask_arr, 0);
/* Start reading three pixels to the left of the output position. */
+ src -= 3;
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 6);
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filter0, filter1, filter2, filter3);
+
+ for (; loop_cnt--;) {
/* _src addresses the second row of the pair. */
+ const uint8_t *_src = src + src_stride;
/* src0/src1: first row at byte offsets 0 and 8; src2/src3: second row. */
+ DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src0, src2);
+ DUP2_ARG2(__lsx_vld, src, 8, _src, 8, src1, src3);
/* Flip the top bit of every pixel byte; undone after filtering below. */
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ mask3, filter0, filter1, filter2, filter3, out0,
+ out1, out2, out3);
/* Narrow to bytes (shift right by 7); out0 is the first row, out1 the second. */
+ DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+ DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+ __lsx_vst(out0, dst, 0);
+ dst += dst_stride;
+ __lsx_vst(out1, dst, 0);
+ dst += dst_stride;
+ src += stride;
+ }
+ }
+
/* 8-tap horizontal filter for a block 32 pixels wide.
 *
 * The loop body is unrolled to filter two rows per iteration and runs
 * height >> 1 times, so an odd trailing row is not processed.
 *
 * filter: eight int8 taps, consumed as four adjacent pairs. */
+ static void common_hz_8t_32w_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height) {
+ uint32_t loop_cnt = height >> 1;
+ __m128i src0, src1, src2, src3;
+ __m128i filter0, filter1, filter2, filter3;
+ __m128i mask0, mask1, mask2, mask3;
+ __m128i out0, out1, out2, out3;
/* Shuffle indices 8..23. NOTE(review): with vshuf_b(src2, src0, shuff) this
 * presumably yields the 16 bytes starting at src + 8 -- confirm operand
 * numbering against the LSX vshuf.b definition. */
+ __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
/* Row 0 of the table (byte offset 0) is the 8-wide shuffle pattern. */
+ mask0 = __lsx_vld(mc_filt_mask_arr, 0);
/* Start reading three pixels to the left of the output position. */
+ src -= 3;
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 6);
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filter0, filter1, filter2, filter3);
+
+ for (; loop_cnt--;) {
/* First row of the pair: loads at byte offsets 0, 16 and 24; src1 is
 * built from src0 and src2 instead of being loaded. */
+ DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2);
+ src3 = __lsx_vld(src, 24);
+ src1 = __lsx_vshuf_b(src2, src0, shuff);
+ src += src_stride;
/* Flip the top bit of every pixel byte; undone after filtering below. */
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ mask3, filter0, filter1, filter2, filter3, out0,
+ out1, out2, out3);
+ DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+ DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
/* 32 output bytes for this row. */
+ __lsx_vst(out0, dst, 0);
+ __lsx_vst(out1, dst, 16);
+
/* Second row of the pair, same sequence. */
+ DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2);
+ src3 = __lsx_vld(src, 24);
+ src1 = __lsx_vshuf_b(src2, src0, shuff);
+ src += src_stride;
+
+ dst += dst_stride;
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ mask3, filter0, filter1, filter2, filter3, out0,
+ out1, out2, out3);
+ DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+ DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+ __lsx_vst(out0, dst, 0);
+ __lsx_vst(out1, dst, 16);
+ dst += dst_stride;
+ }
+ }
+
/* 8-tap horizontal filter for a block 64 pixels wide.
 *
 * Filters one row per loop iteration (height iterations), handling the row
 * as two 32-pixel halves.
 *
 * filter: eight int8 taps, consumed as four adjacent pairs. */
+ static void common_hz_8t_64w_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height) {
+ int32_t loop_cnt = height;
+ __m128i src0, src1, src2, src3;
+ __m128i filter0, filter1, filter2, filter3;
+ __m128i mask0, mask1, mask2, mask3;
+ __m128i out0, out1, out2, out3;
/* Shuffle indices 8..23. NOTE(review): presumably used to build the vector
 * that starts 8 bytes after src0 from two 16-byte loads -- confirm against
 * the LSX vshuf.b definition. */
+ __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
/* Row 0 of the table (byte offset 0) is the 8-wide shuffle pattern. */
+ mask0 = __lsx_vld(mc_filt_mask_arr, 0);
/* Start reading three pixels to the left of the output position. */
+ src -= 3;
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 6);
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filter0, filter1, filter2, filter3);
+
+ for (; loop_cnt--;) {
/* Left half: loads at byte offsets 0, 16 and 24; stored to dst + 0 / + 16. */
+ DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2);
+ src3 = __lsx_vld(src, 24);
+ src1 = __lsx_vshuf_b(src2, src0, shuff);
/* Flip the top bit of every pixel byte; undone after filtering below. */
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ mask3, filter0, filter1, filter2, filter3, out0,
+ out1, out2, out3);
+ DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+ DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+ __lsx_vst(out0, dst, 0);
+ __lsx_vst(out1, dst, 16);
+
/* Right half: loads at byte offsets 32, 48 and 56; stored to dst + 32 / + 48. */
+ DUP2_ARG2(__lsx_vld, src, 32, src, 48, src0, src2);
+ src3 = __lsx_vld(src, 56);
+ src1 = __lsx_vshuf_b(src2, src0, shuff);
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ mask3, filter0, filter1, filter2, filter3, out0,
+ out1, out2, out3);
+ DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+ DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+ __lsx_vst(out0, dst, 32);
+ __lsx_vst(out1, dst, 48);
+ src += src_stride;
+ dst += dst_stride;
+ }
+ }
+
/* 2-tap horizontal filter for a block 4 pixels wide and 4 rows high.
 *
 * filter: points at the two int8 taps to apply; the entry point in this file
 * passes &filt_hor[3], i.e. taps 3 and 4 of the 8-entry kernel.
 *
 * Unlike the 8-tap kernels, src is read from its given position (no left
 * offset) and the pixels are treated as unsigned throughout.
 * Writes 4 bytes to each of four dst rows. */
+ static void common_hz_2t_4x4_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ __m128i src0, src1, src2, src3, mask;
+ __m128i filt0, vec0, vec1, vec2, vec3, res0, res1;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride + src_stride2;
+
+ int32_t dst_stride2 = dst_stride << 1;
+ int32_t dst_stride3 = dst_stride + dst_stride2;
+
/* Row 1 of the table (byte offset 16) is the 4-wide shuffle pattern. */
+ mask = __lsx_vld(mc_filt_mask_arr, 16);
+ /* rearranging filter */
+ filt0 = __lsx_vldrepl_h(filter, 0);
+
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+ src3 = __lsx_vldx(src, src_stride3);
/* Each shuffle combines two rows (0+1, 2+3) into one vector of pixel pairs. */
+ DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask, src3, src2, mask, vec0, vec1);
/* Multiply each pixel pair by the tap pair and sum into 16-bit lanes. */
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, vec3);
/* Narrow to unsigned bytes with a shift right by FILTER_BITS. */
+ DUP2_ARG3(__lsx_vssrarni_bu_h, vec2, vec2, FILTER_BITS, vec3, vec3,
+ FILTER_BITS, res0, res1);
+
/* 32-bit elements 0 and 1 of each result hold one output row each. */
+ __lsx_vstelm_w(res0, dst, 0, 0);
+ __lsx_vstelm_w(res0, dst + dst_stride, 0, 1);
+ __lsx_vstelm_w(res1, dst + dst_stride2, 0, 0);
+ __lsx_vstelm_w(res1, dst + dst_stride3, 0, 1);
+ }
+
/* 2-tap horizontal filter for a block 4 pixels wide and 8 rows high.
 *
 * filter: points at the two int8 taps to apply; the entry point in this file
 * passes &filt_hor[3], i.e. taps 3 and 4 of the 8-entry kernel.
 *
 * src is read from its given position (no left offset) and the pixels are
 * treated as unsigned throughout.
 * Writes 4 bytes to each of eight consecutive dst rows. */
+ static void common_hz_2t_4x8_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ __m128i res0, res1, res2, res3, filt0;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride + src_stride2;
+ int32_t src_stride4 = src_stride2 << 1;
+
+ int32_t dst_stride2 = dst_stride << 1;
+ int32_t dst_stride3 = dst_stride + dst_stride2;
+
/* Base of rows 4..7. The cast is explicit because src is const-qualified;
 * the pointer is only read from (same idiom as _src in the 8-tap kernels). */
+ uint8_t *src_tmp1 = (uint8_t *)src + src_stride4;
+
/* Row 1 of the table (byte offset 16) is the 4-wide shuffle pattern. */
+ mask = __lsx_vld(mc_filt_mask_arr, 16);
+
+ /* rearranging filter */
+ filt0 = __lsx_vldrepl_h(filter, 0);
+
/* Rows 0..4 are addressed from src, rows 5..7 from src_tmp1. */
+ src0 = __lsx_vld(src, 0);
+ DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+ src, src_stride4, src1, src2, src3, src4);
+ DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src5,
+ src6);
+ src7 = __lsx_vldx(src_tmp1, src_stride3);
+
/* Each shuffle combines two rows into one vector of pixel pairs. */
+ DUP4_ARG3(__lsx_vshuf_b, src1, src0, mask, src3, src2, mask, src5, src4, mask,
+ src7, src6, mask, vec0, vec1, vec2, vec3);
/* Multiply each pixel pair by the tap pair and sum into 16-bit lanes. */
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
+ vec4, vec5, vec6, vec7);
/* Narrow to unsigned bytes with a shift right by FILTER_BITS. */
+ DUP4_ARG3(__lsx_vssrarni_bu_h, vec4, vec4, FILTER_BITS, vec5, vec5,
+ FILTER_BITS, vec6, vec6, FILTER_BITS, vec7, vec7, FILTER_BITS, res0,
+ res1, res2, res3);
+
/* 32-bit elements 0 and 1 of each result hold one output row each. */
+ __lsx_vstelm_w(res0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_w(res0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_w(res1, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_w(res1, dst, 0, 1);
+ dst += dst_stride;
+
+ __lsx_vstelm_w(res2, dst, 0, 0);
+ __lsx_vstelm_w(res2, dst + dst_stride, 0, 1);
+ __lsx_vstelm_w(res3, dst + dst_stride2, 0, 0);
+ __lsx_vstelm_w(res3, dst + dst_stride3, 0, 1);
+ }
+
/* Height dispatcher for the 4-wide, 2-tap horizontal filter.
 * Heights 4 and 8 go to their dedicated kernels; any other height leaves
 * dst untouched. */
+ static void common_hz_2t_4w_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+   switch (height) {
+     case 4:
+       common_hz_2t_4x4_lsx(src, src_stride, dst, dst_stride, filter);
+       break;
+     case 8:
+       common_hz_2t_4x8_lsx(src, src_stride, dst, dst_stride, filter);
+       break;
+     default:
+       /* No kernel for other heights; nothing is written. */
+       break;
+   }
+ }
+
/* 2-tap horizontal filter for a block 8 pixels wide and 4 rows high.
 *
 * filter: points at the two int8 taps to apply; the entry point in this file
 * passes &filt_hor[3], i.e. taps 3 and 4 of the 8-entry kernel.
 *
 * Writes 8 bytes to each of four dst rows. */
+ static void common_hz_2t_8x4_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ __m128i filt0, mask;
+ __m128i src0, src1, src2, src3;
+ __m128i vec0, vec1, vec2, vec3;
+
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+
+ int32_t dst_stride2 = dst_stride << 1;
+ int32_t dst_stride3 = dst_stride2 + dst_stride;
+
/* Row 0 of the table (byte offset 0) is the 8-wide shuffle pattern. */
+ mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+ /* rearranging filter */
+ filt0 = __lsx_vldrepl_h(filter, 0);
+
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+ src3 = __lsx_vldx(src, src_stride3);
+
/* Expand each row into adjacent pixel pairs (both shuffle inputs are the
 * same row). */
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, mask,
+ src3, src3, mask, vec0, vec1, vec2, vec3);
/* Multiply each pixel pair by the tap pair and sum into 16-bit lanes. */
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
+ vec0, vec1, vec2, vec3);
/* Narrow to unsigned bytes (shift right by FILTER_BITS); two rows per vector. */
+ DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2,
+ FILTER_BITS, vec0, vec1);
+
/* One 64-bit element per destination row. */
+ __lsx_vstelm_d(vec0, dst, 0, 0);
+ __lsx_vstelm_d(vec0, dst + dst_stride, 0, 1);
+ __lsx_vstelm_d(vec1, dst + dst_stride2, 0, 0);
+ __lsx_vstelm_d(vec1, dst + dst_stride3, 0, 1);
+ }
+
/* 2-tap horizontal filter for a block 8 pixels wide and 8 or 16 rows high.
 *
 * The first eight rows are always produced, as two unrolled groups of four.
 * A further eight rows are produced only when height == 16; no other height
 * is handled beyond the first eight rows.
 *
 * filter: points at the two int8 taps to apply; the entry point in this file
 * passes &filt_hor[3], i.e. taps 3 and 4 of the 8-entry kernel. */
+ static void common_hz_2t_8x8mult_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ __m128i filt0, mask;
+ __m128i src0, src1, src2, src3, out0, out1;
+ __m128i vec0, vec1, vec2, vec3;
+
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+
+ int32_t dst_stride2 = dst_stride << 1;
+ int32_t dst_stride3 = dst_stride2 + dst_stride;
+ int32_t dst_stride4 = dst_stride2 << 1;
+
/* Row 0 of the table (byte offset 0) is the 8-wide shuffle pattern. */
+ mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+ /* rearranging filter */
+ filt0 = __lsx_vldrepl_h(filter, 0);
+
/* Rows 0..3. */
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+ src3 = __lsx_vldx(src, src_stride3);
+ src += src_stride4;
+
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, mask,
+ src3, src3, mask, vec0, vec1, vec2, vec3);
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
+ vec0, vec1, vec2, vec3);
+ DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2,
+ FILTER_BITS, out0, out1);
+
+ __lsx_vstelm_d(out0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(out0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_d(out1, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(out1, dst, 0, 1);
+ dst += dst_stride;
+
/* Rows 4..7. */
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+ src3 = __lsx_vldx(src, src_stride3);
+ src += src_stride4;
+
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, mask,
+ src3, src3, mask, vec0, vec1, vec2, vec3);
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
+ vec0, vec1, vec2, vec3);
+ DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2,
+ FILTER_BITS, out0, out1);
+
+ __lsx_vstelm_d(out0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(out0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_d(out1, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(out1, dst, 0, 1);
+ dst += dst_stride;
+
+ if (height == 16) {
/* dst now addresses row 8; dst_tmp1 addresses row 12. */
+ uint8_t *dst_tmp1 = dst + dst_stride4;
+
/* Rows 8..11. */
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+ src3 = __lsx_vldx(src, src_stride3);
+ src += src_stride4;
+
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2,
+ mask, src3, src3, mask, vec0, vec1, vec2, vec3);
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+ filt0, vec0, vec1, vec2, vec3);
+ DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2,
+ FILTER_BITS, out0, out1);
+
+ __lsx_vstelm_d(out0, dst, 0, 0);
+ __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
+ __lsx_vstelm_d(out1, dst + dst_stride2, 0, 0);
+ __lsx_vstelm_d(out1, dst + dst_stride3, 0, 1);
+
/* Rows 12..15. */
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+ src3 = __lsx_vldx(src, src_stride3);
+ src += src_stride4;
+
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2,
+ mask, src3, src3, mask, vec0, vec1, vec2, vec3);
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+ filt0, vec0, vec1, vec2, vec3);
+ DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2,
+ FILTER_BITS, out0, out1);
+
+ __lsx_vstelm_d(out0, dst_tmp1, 0, 0);
+ __lsx_vstelm_d(out0, dst_tmp1 + dst_stride, 0, 1);
+ __lsx_vstelm_d(out1, dst_tmp1 + dst_stride2, 0, 0);
+ __lsx_vstelm_d(out1, dst_tmp1 + dst_stride3, 0, 1);
+ }
+ }
+
/* Height dispatcher for the 8-wide, 2-tap horizontal filter.
 * Height 4 uses the 8x4 kernel; every other height is forwarded to the
 * 8x8mult kernel together with the height. */
+ static void common_hz_2t_8w_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+   if (height != 4) {
+     common_hz_2t_8x8mult_lsx(src, src_stride, dst, dst_stride, filter, height);
+     return;
+   }
+   common_hz_2t_8x4_lsx(src, src_stride, dst, dst_stride, filter);
+ }
+
/* 2-tap horizontal filter for a block 16 pixels wide.
 *
 * Four rows are filtered before the loop and four more per loop iteration;
 * the loop runs (height >> 2) - 1 times, so height is expected to be a
 * multiple of 4 and at least 4.
 *
 * filter: points at the two int8 taps to apply; the entry point in this file
 * passes &filt_hor[3], i.e. taps 3 and 4 of the 8-entry kernel. */
+ static void common_hz_2t_16w_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt = (height >> 2) - 1;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ __m128i filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i out0, out1, out2, out3, out4, out5, out6, out7;
+
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+
/* Address of the right-hand 8 pixels of each row. The cast is explicit
 * because src is const-qualified; the pointer is only read from (same idiom
 * as _src in the 8-tap kernels). */
+ uint8_t *src_tmp1 = (uint8_t *)src + 8;
/* Row 0 of the table (byte offset 0) is the 8-wide shuffle pattern. */
+ mask = __lsx_vld(mc_filt_mask_arr, 0);
+ filt0 = __lsx_vldrepl_h(filter, 0);
+
/* First four rows: even-numbered vectors come from src (left half),
 * odd-numbered vectors from src_tmp1 (right half). */
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4);
+ src6 = __lsx_vldx(src, src_stride3);
+ src1 = __lsx_vld(src_tmp1, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src3,
+ src5);
+ src7 = __lsx_vldx(src_tmp1, src_stride3);
+ src += src_stride4;
+
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, mask,
+ src3, src3, mask, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, src6, src6, mask,
+ src7, src7, mask, vec4, vec5, vec6, vec7);
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
+ out0, out1, out2, out3);
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, filt0,
+ out4, out5, out6, out7);
/* Narrow each left/right pair into one 16-byte output row. */
+ DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2,
+ FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS, out0,
+ out1, out2, out3);
+
+ __lsx_vst(out0, dst, 0);
+ dst += dst_stride;
+ __lsx_vst(out1, dst, 0);
+ dst += dst_stride;
+ __lsx_vst(out2, dst, 0);
+ dst += dst_stride;
+ __lsx_vst(out3, dst, 0);
+ dst += dst_stride;
+
+ for (; loop_cnt--;) {
/* src was already advanced after the previous group; src_tmp1 catches
 * up here. */
+ src_tmp1 += src_stride4;
+
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4);
+ src6 = __lsx_vldx(src, src_stride3);
+
+ src1 = __lsx_vld(src_tmp1, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src3,
+ src5);
+ src7 = __lsx_vldx(src_tmp1, src_stride3);
+ src += src_stride4;
+
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2,
+ mask, src3, src3, mask, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, src6, src6,
+ mask, src7, src7, mask, vec4, vec5, vec6, vec7);
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+ filt0, out0, out1, out2, out3);
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
+ filt0, out4, out5, out6, out7);
+ DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2,
+ FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS,
+ out0, out1, out2, out3);
+
+ __lsx_vst(out0, dst, 0);
+ dst += dst_stride;
+ __lsx_vst(out1, dst, 0);
+ dst += dst_stride;
+ __lsx_vst(out2, dst, 0);
+ dst += dst_stride;
+ __lsx_vst(out3, dst, 0);
+ dst += dst_stride;
+ }
+ }
+
/* 2-tap horizontal filter for a block 32 pixels wide.
 *
 * Each loop iteration filters two rows, and the loop runs height >> 1
 * times, so an odd trailing row is not processed.
 *
 * filter: points at the two int8 taps to apply; the entry point in this file
 * passes &filt_hor[3], i.e. taps 3 and 4 of the 8-entry kernel. */
+ static void common_hz_2t_32w_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt = (height >> 1);
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ __m128i filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i out0, out1, out2, out3, out4, out5, out6, out7;
/* Shuffle indices 8..23. NOTE(review): presumably used to build the vector
 * that starts 8 bytes after the first load from two 16-byte loads -- confirm
 * against the LSX vshuf.b definition. */
+ __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
/* Row 0 of the table (byte offset 0) is the 8-wide shuffle pattern. */
+ mask = __lsx_vld(mc_filt_mask_arr, 0);
+ /* rearranging filter */
+ filt0 = __lsx_vldrepl_h(filter, 0);
+
+ for (; loop_cnt--;) {
/* First row of the pair: loads at byte offsets 0, 16 and 24. */
+ DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2);
+ src3 = __lsx_vld(src, 24);
+ src1 = __lsx_vshuf_b(src2, src0, shuff);
+ src += src_stride;
/* Second row of the pair. */
+ DUP2_ARG2(__lsx_vld, src, 0, src, 16, src4, src6);
+ src7 = __lsx_vld(src, 24);
+ src5 = __lsx_vshuf_b(src6, src4, shuff);
+ src += src_stride;
+
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2,
+ mask, src3, src3, mask, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, src6, src6,
+ mask, src7, src7, mask, vec4, vec5, vec6, vec7);
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+ filt0, out0, out1, out2, out3);
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
+ filt0, out4, out5, out6, out7);
/* Narrow to unsigned bytes: out0/out1 form the first output row,
 * out2/out3 the second. */
+ DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2,
+ FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS,
+ out0, out1, out2, out3);
+
+ __lsx_vst(out0, dst, 0);
+ __lsx_vst(out1, dst, 16);
+ dst += dst_stride;
+
+ __lsx_vst(out2, dst, 0);
+ __lsx_vst(out3, dst, 16);
+ dst += dst_stride;
+ }
+ }
+
/* 2-tap horizontal filter for a block 64 pixels wide.
 *
 * Filters one full 64-byte row per loop iteration (height iterations).
 *
 * filter: points at the two int8 taps to apply; the entry point in this file
 * passes &filt_hor[3], i.e. taps 3 and 4 of the 8-entry kernel. */
+ static void common_hz_2t_64w_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt = height;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ __m128i filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i out0, out1, out2, out3, out4, out5, out6, out7;
/* Shuffle indices 8..23. NOTE(review): presumably used to build the vectors
 * that sit 8 bytes after each 16-byte load from neighbouring loads --
 * confirm against the LSX vshuf.b definition. */
+ __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
/* Row 0 of the table (byte offset 0) is the 8-wide shuffle pattern. */
+ mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+ /* rearranging filter */
+ filt0 = __lsx_vldrepl_h(filter, 0);
+
+ for (; loop_cnt--;) {
/* Direct loads at byte offsets 0, 16, 32, 48 and 56; the vectors in
 * between (src1, src3, src5) are built by shuffling neighbouring loads. */
+ DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src2, src4,
+ src6);
+ src7 = __lsx_vld(src, 56);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src0, shuff, src4, src2, shuff, src1, src3);
+ src5 = __lsx_vshuf_b(src6, src4, shuff);
+ src += src_stride;
+
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2,
+ mask, src3, src3, mask, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, src6, src6,
+ mask, src7, src7, mask, vec4, vec5, vec6, vec7);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+ filt0, out0, out1, out2, out3);
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
+ filt0, out4, out5, out6, out7);
/* Narrow to unsigned bytes: four 16-byte vectors make up the output row. */
+ DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2,
+ FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS,
+ out0, out1, out2, out3);
+
+ __lsx_vst(out0, dst, 0);
+ __lsx_vst(out1, dst, 16);
+ __lsx_vst(out2, dst, 32);
+ __lsx_vst(out3, dst, 48);
+ dst += dst_stride;
+ }
+ }
+
/* LSX entry point for the horizontal convolution.
 *
 * Selects the kernel row filter[x0_q4], narrows its eight taps to int8 and
 * dispatches on the tap count (2-tap vs. 8-tap) and the block width w.
 * Widths other than 4, 8, 16, 32 and 64 are forwarded unchanged to
 * vpx_convolve8_horiz_c.
 *
 * Asserts that x_step_q4 == 16 and that the second 32-bit word of the kernel
 * row is not 0x800000. */
+ void vpx_convolve8_horiz_lsx(const uint8_t *src, ptrdiff_t src_stride,
+                              uint8_t *dst, ptrdiff_t dst_stride,
+                              const InterpKernel *filter, int x0_q4,
+                              int x_step_q4, int y0_q4, int y_step_q4, int w,
+                              int h) {
+   const int16_t *const filter_x = filter[x0_q4];
+   const int32_t s_stride = (int32_t)src_stride;
+   const int32_t d_stride = (int32_t)dst_stride;
+   int8_t filt_hor[8];
+   int8_t *const taps_2t = &filt_hor[3]; /* the 2-tap kernels start at tap 3 */
+   int idx;
+   int is_two_tap;
+
+   assert(x_step_q4 == 16);
+   assert(((const int32_t *)filter_x)[1] != 0x800000);
+
+   /* Narrow the 16-bit taps to int8 for the vector kernels. */
+   for (idx = 0; idx < 8; ++idx) {
+     filt_hor[idx] = filter_x[idx];
+   }
+   is_two_tap = (vpx_get_filter_taps(filter_x) == 2);
+
+   switch (w) {
+     case 4:
+       if (is_two_tap) {
+         common_hz_2t_4w_lsx(src, s_stride, dst, d_stride, taps_2t, h);
+       } else {
+         common_hz_8t_4w_lsx(src, s_stride, dst, d_stride, filt_hor, h);
+       }
+       break;
+     case 8:
+       if (is_two_tap) {
+         common_hz_2t_8w_lsx(src, s_stride, dst, d_stride, taps_2t, h);
+       } else {
+         common_hz_8t_8w_lsx(src, s_stride, dst, d_stride, filt_hor, h);
+       }
+       break;
+     case 16:
+       if (is_two_tap) {
+         common_hz_2t_16w_lsx(src, s_stride, dst, d_stride, taps_2t, h);
+       } else {
+         common_hz_8t_16w_lsx(src, s_stride, dst, d_stride, filt_hor, h);
+       }
+       break;
+     case 32:
+       if (is_two_tap) {
+         common_hz_2t_32w_lsx(src, s_stride, dst, d_stride, taps_2t, h);
+       } else {
+         common_hz_8t_32w_lsx(src, s_stride, dst, d_stride, filt_hor, h);
+       }
+       break;
+     case 64:
+       if (is_two_tap) {
+         common_hz_2t_64w_lsx(src, s_stride, dst, d_stride, taps_2t, h);
+       } else {
+         common_hz_8t_64w_lsx(src, s_stride, dst, d_stride, filt_hor, h);
+       }
+       break;
+     default:
+       /* No LSX kernel for this width: use the C implementation. */
+       vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                             x_step_q4, y0_q4, y_step_q4, w, h);
+       break;
+   }
+ }
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_lsx.c
new file mode 100644
index 0000000000..9f5cd6cfe9
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_lsx.c
@@ -0,0 +1,697 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/vpx_convolve_lsx.h"
+
+static const uint8_t mc_filt_mask_arr[16 * 3] = {
+ /* Byte-shuffle index tables, one 16-byte row per __lsx_vld byte offset.
+ * Row at offset 0: adjacent index pairs (0,1) .. (7,8); loaded by the
+ * 8-wide and wider paths in this file. */
+ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+ /* Row at offset 16: pairs (0,1) .. (3,4) followed by (16,17) .. (19,20);
+ * loaded by the 4-wide paths in this file.
+ * NOTE(review): indices >= 16 presumably select bytes of the second
+ * shuffle operand, so two rows are handled per vector -- confirm against
+ * the horiz_*_filt helpers in vpx_convolve_lsx.h. */
+ 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+ /* Row at offset 32: pairs (8,9) .. (11,12) followed by (24,25) .. (27,28);
+ * not loaded by any function in this file. */
+ 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+};
+
+static void common_hv_8ht_8vt_4w_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) { /*
+ * Runs a horizontal 8-tap pass and then a vertical 8-tap pass, storing
+ * 4 bytes per output row to dst.
+ *
+ * src, src_stride: source pointer and row pitch; reads begin 3 columns to
+ * the left of and 3 rows above src.
+ * dst, dst_stride: destination pointer and row pitch.
+ * filter_horiz, filter_vert: tap arrays, each read as four 16-bit tap
+ * pairs at byte offsets 0, 2, 4 and 6.
+ * height: output rows; the loop runs height >> 2 times and stores 4 rows
+ * per pass, so a remainder of height % 4 rows is never written.
+ */
+ uint32_t loop_cnt = (height >> 2);
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+ __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3;
+ __m128i mask0, mask1, mask2, mask3;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+ __m128i out0, out1;
+ __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+ mask0 = __lsx_vld(mc_filt_mask_arr, 16); /* second table row (byte offset 16) */
+ src -= (3 + 3 * src_stride); /* step back 3 columns and 3 rows */
+ DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz, 4,
+ filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); /* mask0 indices + 2, + 4 */
+ mask3 = __lsx_vaddi_bu(mask0, 6); /* mask0 indices + 6 */
+
+ LSX_LD_4(src, src_stride, src0, src1, src2, src3); /* src0..src6: the 7 rows needed before the first output row */
+ src += src_stride;
+ src4 = __lsx_vld(src, 0);
+ src += src_stride;
+ src5 = __lsx_vld(src, 0);
+ src += src_stride;
+ src6 = __lsx_vld(src, 0);
+ src += src_stride;
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3); /* flip the top bit of every byte (x ^ 0x80) before filtering */
+ DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
+ src6 = __lsx_vxori_b(src6, 128);
+
+ tmp0 = horiz_8tap_filt(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3); /* horizontal pass, two source rows per call */
+ tmp2 = horiz_8tap_filt(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ tmp4 = horiz_8tap_filt(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ tmp5 = horiz_8tap_filt(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ DUP2_ARG3(__lsx_vshuf_b, tmp2, tmp0, shuff, tmp4, tmp2, shuff, tmp1, tmp3);
+ DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4,
+ filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
+ DUP2_ARG2(__lsx_vpackev_b, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
+ tmp2 = __lsx_vpackev_b(tmp5, tmp4);
+
+ for (; loop_cnt--;) { /* each pass consumes 4 new source rows and emits 4 output rows */
+ LSX_LD_4(src, src_stride, src7, src8, src9, src10);
+ src += src_stride;
+ DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7,
+ src8, src9, src10);
+ tmp3 = horiz_8tap_filt(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ tmp4 = __lsx_vshuf_b(tmp3, tmp5, shuff);
+ tmp4 = __lsx_vpackev_b(tmp3, tmp4);
+ out0 = filt_8tap_dpadd_s_h(tmp0, tmp1, tmp2, tmp4, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3); /* vertical pass over the packed row pairs */
+ src1 = horiz_8tap_filt(src9, src10, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ src0 = __lsx_vshuf_b(src1, tmp3, shuff);
+ src0 = __lsx_vpackev_b(src1, src0);
+ out1 = filt_8tap_dpadd_s_h(tmp1, tmp2, tmp4, src0, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3);
+ out0 = __lsx_vssrarni_b_h(out1, out0, 7); /* shift right by 7 and narrow 16-bit sums to bytes */
+ out0 = __lsx_vxori_b(out0, 128); /* undo the ^ 0x80 applied to the inputs */
+ __lsx_vstelm_w(out0, dst, 0, 0); /* one 32-bit element (4 bytes) per output row */
+ dst += dst_stride;
+ __lsx_vstelm_w(out0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_w(out0, dst, 0, 2);
+ dst += dst_stride;
+ __lsx_vstelm_w(out0, dst, 0, 3);
+ dst += dst_stride;
+
+ tmp5 = src1; /* carry the newest filtered rows into the next pass */
+ tmp0 = tmp2;
+ tmp1 = tmp4;
+ tmp2 = src0;
+ }
+}
+
+static void common_hv_8ht_8vt_8w_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) { /*
+ * Runs a horizontal 8-tap pass and then a vertical 8-tap pass, storing
+ * 8 bytes per output row to dst. The 16/32/64-wide variants in this file
+ * call this function once per 8-pixel column.
+ *
+ * src, src_stride: source pointer and row pitch; reads begin 3 columns to
+ * the left of and 3 rows above src.
+ * dst, dst_stride: destination pointer and row pitch.
+ * filter_horiz, filter_vert: tap arrays, each read as four 16-bit tap
+ * pairs at byte offsets 0, 2, 4 and 6.
+ * height: output rows; the loop runs height >> 2 times and stores 4 rows
+ * per pass, so a remainder of height % 4 rows is never written.
+ */
+ uint32_t loop_cnt = (height >> 2);
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+ __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3;
+ __m128i mask0, mask1, mask2, mask3;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
+ __m128i out0, out1;
+
+ mask0 = __lsx_vld(mc_filt_mask_arr, 0); /* first table row (byte offset 0) */
+ src -= (3 + 3 * src_stride); /* step back 3 columns and 3 rows */
+ DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz, 4,
+ filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); /* mask0 indices + 2, + 4 */
+ mask3 = __lsx_vaddi_bu(mask0, 6); /* mask0 indices + 6 */
+
+ LSX_LD_4(src, src_stride, src0, src1, src2, src3); /* src0..src6: the 7 rows needed before the first output row */
+ src += src_stride;
+ src4 = __lsx_vld(src, 0);
+ src += src_stride;
+ src5 = __lsx_vld(src, 0);
+ src += src_stride;
+ src6 = __lsx_vld(src, 0);
+ src += src_stride;
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3); /* flip the top bit of every byte (x ^ 0x80) before filtering */
+ DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
+ src6 = __lsx_vxori_b(src6, 128);
+
+ src0 = horiz_8tap_filt(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3); /* horizontal pass; the same row is given as both source operands */
+ src1 = horiz_8tap_filt(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ src2 = horiz_8tap_filt(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ src3 = horiz_8tap_filt(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ src4 = horiz_8tap_filt(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ src5 = horiz_8tap_filt(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ src6 = horiz_8tap_filt(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+
+ DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4,
+ filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
+ DUP4_ARG2(__lsx_vpackev_b, src1, src0, src3, src2, src5, src4, src2, src1,
+ tmp0, tmp1, tmp2, tmp4); /* tmp0..tmp2 pair rows (0,1) (2,3) (4,5); tmp4 pairs rows (1,2) */
+ DUP2_ARG2(__lsx_vpackev_b, src4, src3, src6, src5, tmp5, tmp6); /* row pairs (3,4) and (5,6) */
+
+ for (; loop_cnt--;) { /* each pass consumes 4 new source rows and emits 4 output rows */
+ LSX_LD_4(src, src_stride, src7, src8, src9, src10);
+ src += src_stride;
+ DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7,
+ src8, src9, src10);
+ src7 = horiz_8tap_filt(src7, src7, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ tmp3 = __lsx_vpackev_b(src7, src6);
+ out0 = filt_8tap_dpadd_s_h(tmp0, tmp1, tmp2, tmp3, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3); /* vertical sums for output row 0 of this pass */
+ src8 = horiz_8tap_filt(src8, src8, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ src0 = __lsx_vpackev_b(src8, src7);
+ out1 = filt_8tap_dpadd_s_h(tmp4, tmp5, tmp6, src0, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3); /* output row 1 */
+ src9 = horiz_8tap_filt(src9, src9, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ src1 = __lsx_vpackev_b(src9, src8);
+ src3 = filt_8tap_dpadd_s_h(tmp1, tmp2, tmp3, src1, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3); /* output row 2 */
+ src10 = horiz_8tap_filt(src10, src10, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ src2 = __lsx_vpackev_b(src10, src9);
+ src4 = filt_8tap_dpadd_s_h(tmp5, tmp6, src0, src2, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3); /* output row 3 */
+ DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, src4, src3, 7, out0, out1); /* shift right by 7 and narrow to bytes, two rows per vector */
+ DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); /* undo the ^ 0x80 applied to the inputs */
+ __lsx_vstelm_d(out0, dst, 0, 0); /* one 64-bit element (8 bytes) per output row */
+ dst += dst_stride;
+ __lsx_vstelm_d(out0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_d(out1, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(out1, dst, 0, 1);
+ dst += dst_stride;
+
+ src6 = src10; /* carry the newest filtered rows into the next pass */
+ tmp0 = tmp2;
+ tmp1 = tmp3;
+ tmp2 = src1;
+ tmp4 = tmp6;
+ tmp5 = src0;
+ tmp6 = src2;
+ }
+}
+
+/* 8-tap horizontal + 8-tap vertical convolution for a 16-pixel-wide block.
+ *
+ * The block is processed as two adjacent 8-pixel columns, each handled by
+ * common_hv_8ht_8vt_8w_lsx() with the same strides, taps and height.
+ *
+ * src, src_stride:  source pointer and row pitch.
+ * dst, dst_stride:  destination pointer and row pitch.
+ * filter_horiz, filter_vert: tap arrays forwarded unchanged.
+ * height:           number of rows, forwarded unchanged.
+ */
+static void common_hv_8ht_8vt_16w_lsx(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter_horiz, int8_t *filter_vert,
+                                      int32_t height) {
+  int32_t col;
+
+  /* Column offsets 0 and 8; no pointer is advanced after the last call. */
+  for (col = 0; col < 16; col += 8) {
+    common_hv_8ht_8vt_8w_lsx(src + col, src_stride, dst + col, dst_stride,
+                             filter_horiz, filter_vert, height);
+  }
+}
+
+/* 8-tap horizontal + 8-tap vertical convolution for a 32-pixel-wide block,
+ * done as four adjacent 8-pixel columns via common_hv_8ht_8vt_8w_lsx().
+ * All other arguments are forwarded unchanged to each column call.
+ */
+static void common_hv_8ht_8vt_32w_lsx(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter_horiz, int8_t *filter_vert,
+                                      int32_t height) {
+  int32_t col;
+
+  /* Column offsets 0, 8, 16, 24. */
+  for (col = 0; col < 32; col += 8) {
+    common_hv_8ht_8vt_8w_lsx(src + col, src_stride, dst + col, dst_stride,
+                             filter_horiz, filter_vert, height);
+  }
+}
+
+/* 8-tap horizontal + 8-tap vertical convolution for a 64-pixel-wide block,
+ * done as eight adjacent 8-pixel columns via common_hv_8ht_8vt_8w_lsx().
+ * All other arguments are forwarded unchanged to each column call.
+ */
+static void common_hv_8ht_8vt_64w_lsx(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter_horiz, int8_t *filter_vert,
+                                      int32_t height) {
+  int32_t col;
+
+  /* Column offsets 0, 8, ..., 56. */
+  for (col = 0; col < 64; col += 8) {
+    common_hv_8ht_8vt_8w_lsx(src + col, src_stride, dst + col, dst_stride,
+                             filter_horiz, filter_vert, height);
+  }
+}
+
+static void common_hv_2ht_2vt_4x4_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz,
+ int8_t *filter_vert) { /*
+ * 2-tap horizontal pass followed by a 2-tap vertical pass for a block
+ * 4 pixels wide and 4 rows high.
+ *
+ * src, src_stride: source pointer and row pitch; 5 rows are loaded
+ * (src + 0 .. src + 4 * src_stride).
+ * dst, dst_stride: destination pointer and row pitch; 4 rows of 4 bytes
+ * are stored.
+ * filter_horiz, filter_vert: each is read as one 16-bit tap pair at byte
+ * offset 0 (vpx_convolve8_lsx passes &filt[3] here).
+ */
+ __m128i src0, src1, src2, src3, src4, mask;
+ __m128i filt_vt, filt_hz, vec0, vec1;
+ __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1;
+ __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride + src_stride2;
+ int32_t src_stride4 = src_stride2 << 1;
+
+ int32_t dst_stride2 = dst_stride << 1;
+ int32_t dst_stride3 = dst_stride2 + dst_stride;
+ mask = __lsx_vld(mc_filt_mask_arr, 16); /* second table row (byte offset 16) */
+
+ /* rearranging filter */
+ filt_hz = __lsx_vldrepl_h(filter_horiz, 0);
+ filt_vt = __lsx_vldrepl_h(filter_vert, 0);
+
+ src0 = __lsx_vld(src, 0);
+ DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+ src, src_stride4, src1, src2, src3, src4);
+ hz_out0 = horiz_2tap_filt_uh(src0, src1, mask, filt_hz); /* horizontal pass, two source rows per call */
+ hz_out2 = horiz_2tap_filt_uh(src2, src3, mask, filt_hz);
+ hz_out4 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); /* fifth row is passed as both operands */
+
+ hz_out1 = __lsx_vshuf_b(hz_out2, hz_out0, shuff);
+ hz_out3 = __lsx_vpickod_d(hz_out4, hz_out2);
+
+ DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); /* vertical pass: byte-pair products with the vertical taps */
+ DUP2_ARG3(__lsx_vssrarni_bu_h, tmp0, tmp0, FILTER_BITS, tmp1, tmp1,
+ FILTER_BITS, tmp0, tmp1); /* shift right by FILTER_BITS and narrow to unsigned bytes */
+
+ __lsx_vstelm_w(tmp0, dst, 0, 0); /* 32-bit elements 0 and 1 of tmp0, then of tmp1: one per output row */
+ __lsx_vstelm_w(tmp0, dst + dst_stride, 0, 1);
+ __lsx_vstelm_w(tmp1, dst + dst_stride2, 0, 0);
+ __lsx_vstelm_w(tmp1, dst + dst_stride3, 0, 1);
+}
+
+static void common_hv_2ht_2vt_4x8_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz,
+ int8_t *filter_vert) { /*
+ * 2-tap horizontal pass followed by a 2-tap vertical pass for a block
+ * 4 pixels wide and 8 rows high.
+ *
+ * src, src_stride: source pointer and row pitch; 9 rows are loaded
+ * (src + 0 .. src + 8 * src_stride).
+ * dst, dst_stride: destination pointer and row pitch; 8 rows of 4 bytes
+ * are stored.
+ * filter_horiz, filter_vert: each is read as one 16-bit tap pair at byte
+ * offset 0 (vpx_convolve8_lsx passes &filt[3] here).
+ */
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
+ __m128i filt_hz, filt_vt, vec0, vec1, vec2, vec3;
+ __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+ __m128i hz_out7, hz_out8, vec4, vec5, vec6, vec7;
+ __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+
+ int32_t dst_stride2 = dst_stride << 1;
+ int32_t dst_stride3 = dst_stride2 + dst_stride;
+ int32_t dst_stride4 = dst_stride2 << 1;
+
+ mask = __lsx_vld(mc_filt_mask_arr, 16); /* second table row (byte offset 16) */
+
+ /* rearranging filter */
+ DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt);
+
+ src0 = __lsx_vld(src, 0);
+ DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+ src, src_stride4, src1, src2, src3, src4);
+ src += src_stride4; /* src now points at the row already held in src4 */
+ DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+ src, src_stride4, src5, src6, src7, src8);
+ src += src_stride4; /* src is not read again after this point */
+
+ hz_out0 = horiz_2tap_filt_uh(src0, src1, mask, filt_hz); /* horizontal pass, two source rows per call */
+ hz_out2 = horiz_2tap_filt_uh(src2, src3, mask, filt_hz);
+ hz_out4 = horiz_2tap_filt_uh(src4, src5, mask, filt_hz);
+ hz_out6 = horiz_2tap_filt_uh(src6, src7, mask, filt_hz);
+ hz_out8 = horiz_2tap_filt_uh(src8, src8, mask, filt_hz); /* ninth row is passed as both operands */
+
+ DUP2_ARG3(__lsx_vshuf_b, hz_out2, hz_out0, shuff, hz_out4, hz_out2, shuff,
+ hz_out1, hz_out3);
+ hz_out5 = __lsx_vshuf_b(hz_out6, hz_out4, shuff);
+ hz_out7 = __lsx_vpickod_d(hz_out8, hz_out6);
+ DUP4_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, hz_out5,
+ hz_out4, hz_out7, hz_out6, vec0, vec1, vec2, vec3);
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, vec2, filt_vt, vec3,
+ filt_vt, vec4, vec5, vec6, vec7); /* vertical pass: byte-pair products with the vertical taps */
+ DUP4_ARG3(__lsx_vssrarni_bu_h, vec4, vec4, FILTER_BITS, vec5, vec5,
+ FILTER_BITS, vec6, vec6, FILTER_BITS, vec7, vec7, FILTER_BITS, vec4,
+ vec5, vec6, vec7); /* shift right by FILTER_BITS and narrow to unsigned bytes */
+
+ __lsx_vstelm_w(vec4, dst, 0, 0); /* rows 0..3: 32-bit elements 0 and 1 of vec4, then vec5 */
+ __lsx_vstelm_w(vec4, dst + dst_stride, 0, 1);
+ __lsx_vstelm_w(vec5, dst + dst_stride2, 0, 0);
+ __lsx_vstelm_w(vec5, dst + dst_stride3, 0, 1);
+ dst += dst_stride4;
+ __lsx_vstelm_w(vec6, dst, 0, 0); /* rows 4..7: same layout from vec6 and vec7 */
+ __lsx_vstelm_w(vec6, dst + dst_stride, 0, 1);
+ __lsx_vstelm_w(vec7, dst + dst_stride2, 0, 0);
+ __lsx_vstelm_w(vec7, dst + dst_stride3, 0, 1);
+}
+
+/* 2-tap horizontal + 2-tap vertical convolution for 4-pixel-wide blocks.
+ *
+ * Selects the fixed-size kernel matching `height`: 4 rows or 8 rows.
+ * Any other height performs no work and leaves dst untouched.
+ */
+static void common_hv_2ht_2vt_4w_lsx(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     int8_t *filter_horiz, int8_t *filter_vert,
+                                     int32_t height) {
+  switch (height) {
+    case 4:
+      common_hv_2ht_2vt_4x4_lsx(src, src_stride, dst, dst_stride,
+                                filter_horiz, filter_vert);
+      break;
+    case 8:
+      common_hv_2ht_2vt_4x8_lsx(src, src_stride, dst, dst_stride,
+                                filter_horiz, filter_vert);
+      break;
+    default:
+      /* No kernel for this height; nothing is written. */
+      break;
+  }
+}
+
+static void common_hv_2ht_2vt_8x4_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz,
+ int8_t *filter_vert) { /*
+ * 2-tap horizontal pass followed by a 2-tap vertical pass for a block
+ * 8 pixels wide and 4 rows high.
+ *
+ * src, src_stride: source pointer and row pitch; 5 rows are loaded
+ * (src + 0 .. src + 4 * src_stride).
+ * dst, dst_stride: destination pointer and row pitch; 4 rows of 8 bytes
+ * are stored.
+ * filter_horiz, filter_vert: each is read as one 16-bit tap pair at byte
+ * offset 0 (vpx_convolve8_lsx passes &filt[3] here).
+ */
+ __m128i src0, src1, src2, src3, src4, mask;
+ __m128i filt_hz, filt_vt, vec0, vec1, vec2, vec3;
+ __m128i hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
+
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+
+ int32_t dst_stride2 = dst_stride << 1;
+ int32_t dst_stride3 = dst_stride2 + dst_stride;
+
+ mask = __lsx_vld(mc_filt_mask_arr, 0); /* first table row (byte offset 0) */
+
+ /* rearranging filter */
+ DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt);
+
+ src0 = __lsx_vld(src, 0);
+ DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+ src, src_stride4, src1, src2, src3, src4);
+
+ hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz); /* hz_out0 / hz_out1 alternate as previous / current filtered row */
+ hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
+ vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+ tmp0 = __lsx_vdp2_h_bu(vec0, filt_vt); /* output row 0 from filtered rows 0 and 1 */
+
+ hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz);
+ vec1 = __lsx_vpackev_b(hz_out0, hz_out1);
+ tmp1 = __lsx_vdp2_h_bu(vec1, filt_vt); /* output row 1 from filtered rows 1 and 2 */
+
+ hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz);
+ vec2 = __lsx_vpackev_b(hz_out1, hz_out0);
+ tmp2 = __lsx_vdp2_h_bu(vec2, filt_vt); /* output row 2 from filtered rows 2 and 3 */
+
+ hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
+ vec3 = __lsx_vpackev_b(hz_out0, hz_out1);
+ tmp3 = __lsx_vdp2_h_bu(vec3, filt_vt); /* output row 3 from filtered rows 3 and 4 */
+
+ DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+ FILTER_BITS, tmp0, tmp1); /* shift right by FILTER_BITS and narrow to unsigned bytes, two rows per vector */
+
+ __lsx_vstelm_d(tmp0, dst, 0, 0); /* one 64-bit element (8 bytes) per output row */
+ __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1);
+ __lsx_vstelm_d(tmp1, dst + dst_stride2, 0, 0);
+ __lsx_vstelm_d(tmp1, dst + dst_stride3, 0, 1);
+}
+
+static void common_hv_2ht_2vt_8x8mult_lsx(const uint8_t *src,
+ int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter_horiz,
+ int8_t *filter_vert, int32_t height) { /*
+ * 2-tap horizontal pass followed by a 2-tap vertical pass for an
+ * 8-pixel-wide block, producing 8 output rows per loop pass.
+ *
+ * src, src_stride: source pointer and row pitch; one row is loaded before
+ * the loop and 8 more rows in every pass.
+ * dst, dst_stride: destination pointer and row pitch; 8 bytes are stored
+ * per row.
+ * filter_horiz, filter_vert: each is read as one 16-bit tap pair at byte
+ * offset 0 (vpx_convolve8_lsx passes &filt[3] here).
+ * height: output rows; the loop runs height >> 3 times, so a remainder of
+ * height % 8 rows is never written. common_hv_2ht_2vt_8w_lsx sends every
+ * height other than 4 here.
+ */
+ uint32_t loop_cnt = (height >> 3);
+ __m128i src0, src1, src2, src3, src4, mask;
+ __m128i filt_hz, filt_vt, vec0;
+ __m128i hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4;
+
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+
+ mask = __lsx_vld(mc_filt_mask_arr, 0); /* first table row (byte offset 0) */
+
+ /* rearranging filter */
+ DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt);
+
+ src0 = __lsx_vld(src, 0);
+ src += src_stride;
+
+ hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz); /* filtered first row; acts as the previous row for the first output */
+
+ for (; loop_cnt--;) {
+ src1 = __lsx_vld(src, 0); /* source rows for the first 4 outputs of this pass */
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
+ src4 = __lsx_vldx(src, src_stride3);
+ src += src_stride4;
+
+ hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz); /* hz_out0 / hz_out1 alternate as previous / current filtered row */
+ vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+ tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+ hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz);
+ vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
+ tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+ hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz);
+ vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+ tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+ hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
+ src1 = __lsx_vld(src, 0); /* src1..src4 are reused: rows for the second 4 outputs are loaded here, before the first group is stored */
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
+ src4 = __lsx_vldx(src, src_stride3);
+ src += src_stride4;
+ vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
+ tmp4 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+ DUP2_ARG3(__lsx_vssrarni_bu_h, tmp2, tmp1, FILTER_BITS, tmp4, tmp3,
+ FILTER_BITS, tmp1, tmp2); /* shift right by FILTER_BITS and narrow to unsigned bytes, two rows per vector */
+
+ __lsx_vstelm_d(tmp1, dst, 0, 0); /* first 4 output rows of this pass, 8 bytes each */
+ dst += dst_stride;
+ __lsx_vstelm_d(tmp1, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_d(tmp2, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(tmp2, dst, 0, 1);
+ dst += dst_stride;
+
+ hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz); /* same sequence again for the second group of 4 rows */
+ vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+ tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+ hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz);
+ vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
+ tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+ hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz);
+ vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+ tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+ hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); /* left in hz_out0 as the previous row for the next pass */
+ vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
+ tmp4 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+ DUP2_ARG3(__lsx_vssrarni_bu_h, tmp2, tmp1, FILTER_BITS, tmp4, tmp3,
+ FILTER_BITS, tmp1, tmp2);
+
+ __lsx_vstelm_d(tmp1, dst, 0, 0); /* last 4 output rows of this pass */
+ dst += dst_stride;
+ __lsx_vstelm_d(tmp1, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_d(tmp2, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(tmp2, dst, 0, 1);
+ dst += dst_stride;
+ }
+}
+
+/* 2-tap horizontal + 2-tap vertical convolution for 8-pixel-wide blocks.
+ *
+ * A height of exactly 4 uses the dedicated 8x4 kernel; every other height
+ * is forwarded to the kernel that works in groups of 8 rows.
+ */
+static void common_hv_2ht_2vt_8w_lsx(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     int8_t *filter_horiz, int8_t *filter_vert,
+                                     int32_t height) {
+  if (height != 4) {
+    common_hv_2ht_2vt_8x8mult_lsx(src, src_stride, dst, dst_stride,
+                                  filter_horiz, filter_vert, height);
+    return;
+  }
+
+  common_hv_2ht_2vt_8x4_lsx(src, src_stride, dst, dst_stride, filter_horiz,
+                            filter_vert);
+}
+
+/* 2-tap horizontal + 2-tap vertical convolution for a 16-pixel-wide block.
+ *
+ * Each source row is loaded as two vectors (at src and src + 8); each half
+ * is filtered horizontally on its own, then consecutive filtered rows are
+ * combined with the vertical taps and stored as one 16-byte row.
+ *
+ * src, src_stride:  source pointer and row pitch; one row is loaded before
+ *                   the loop and 4 more rows in every pass.
+ * dst, dst_stride:  destination pointer and row pitch; 16 bytes per row.
+ * filter_horiz, filter_vert: each is read as one 16-bit tap pair at byte
+ *                   offset 0 (vpx_convolve8_lsx passes &filt[3] here).
+ * height:           output rows; the loop runs height >> 2 times, so a
+ *                   remainder of height % 4 rows is never written.
+ */
+static void common_hv_2ht_2vt_16w_lsx(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter_horiz, int8_t *filter_vert,
+                                      int32_t height) {
+  uint32_t loop_cnt = (height >> 2);
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
+  __m128i filt_hz, filt_vt, vec0, vec1;
+  __m128i tmp, tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+  /* rearranging filter */
+  DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt);
+
+  /* First row: left half at src, right half at src + 8. */
+  DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
+  src += src_stride;
+
+  hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz);
+  hz_out2 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
+
+  for (; loop_cnt--;) {
+    /* The cast is explicit because src is const-qualified; the pointer is
+     * only ever read through. Same convention as _src in
+     * vpx_convolve8_vert_lsx.c. */
+    uint8_t *src_tmp0 = (uint8_t *)src + 8;
+
+    DUP2_ARG2(__lsx_vld, src, 0, src_tmp0, 0, src0, src1);
+    DUP4_ARG2(__lsx_vldx, src, src_stride, src_tmp0, src_stride, src,
+              src_stride2, src_tmp0, src_stride2, src2, src3, src4, src5);
+    DUP2_ARG2(__lsx_vldx, src, src_stride3, src_tmp0, src_stride3, src6, src7);
+    src += src_stride4;
+
+    /* hz_out0/hz_out2 and hz_out1/hz_out3 alternate as previous/current
+     * filtered row for the left/right halves. */
+    hz_out1 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz);
+    hz_out3 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2);
+    tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS);
+    __lsx_vst(tmp, dst, 0);
+    dst += dst_stride;
+
+    hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz);
+    hz_out2 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2);
+    tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS);
+    __lsx_vst(tmp, dst, 0);
+    dst += dst_stride;
+
+    hz_out1 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
+    hz_out3 = horiz_2tap_filt_uh(src5, src5, mask, filt_hz);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2);
+    tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS);
+    __lsx_vst(tmp, dst, 0);
+    dst += dst_stride;
+
+    hz_out0 = horiz_2tap_filt_uh(src6, src6, mask, filt_hz);
+    hz_out2 = horiz_2tap_filt_uh(src7, src7, mask, filt_hz);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2);
+    tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS);
+    __lsx_vst(tmp, dst, 0);
+    dst += dst_stride;
+  }
+}
+
+/* 2-tap horizontal + 2-tap vertical convolution for a 32-pixel-wide block,
+ * done as two adjacent 16-pixel columns via common_hv_2ht_2vt_16w_lsx().
+ * All other arguments are forwarded unchanged to each column call.
+ */
+static void common_hv_2ht_2vt_32w_lsx(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter_horiz, int8_t *filter_vert,
+                                      int32_t height) {
+  int32_t col;
+
+  /* Column offsets 0 and 16. */
+  for (col = 0; col < 32; col += 16) {
+    common_hv_2ht_2vt_16w_lsx(src + col, src_stride, dst + col, dst_stride,
+                              filter_horiz, filter_vert, height);
+  }
+}
+
+/* 2-tap horizontal + 2-tap vertical convolution for a 64-pixel-wide block,
+ * done as four adjacent 16-pixel columns via common_hv_2ht_2vt_16w_lsx().
+ * All other arguments are forwarded unchanged to each column call.
+ */
+static void common_hv_2ht_2vt_64w_lsx(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter_horiz, int8_t *filter_vert,
+                                      int32_t height) {
+  int32_t col;
+
+  /* Column offsets 0, 16, 32, 48. */
+  for (col = 0; col < 64; col += 16) {
+    common_hv_2ht_2vt_16w_lsx(src + col, src_stride, dst + col, dst_stride,
+                              filter_horiz, filter_vert, height);
+  }
+}
+
+/* LSX entry point for the combined horizontal + vertical convolution.
+ *
+ * Picks the kernels filter[x0_q4] and filter[y0_q4], narrows their eight
+ * 16-bit taps to int8_t, and dispatches on the tap count reported by
+ * vpx_get_filter_taps() and on the block width:
+ *   - both kernels 2-tap:    common_hv_2ht_2vt_<w>w_lsx, given &taps[3];
+ *   - exactly one is 2-tap:  vpx_convolve8_c;
+ *   - otherwise:             common_hv_8ht_8vt_<w>w_lsx, given all 8 taps.
+ * Widths other than 4, 8, 16, 32 and 64 always go to vpx_convolve8_c.
+ *
+ * src, src_stride:  source pointer and row pitch.
+ * dst, dst_stride:  destination pointer and row pitch.
+ * filter:           kernel table indexed by x0_q4 / y0_q4.
+ * x_step_q4, y_step_q4: must both be 16 (asserted).
+ * w, h:             block width and height in pixels.
+ */
+void vpx_convolve8_lsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+                       ptrdiff_t dst_stride, const InterpKernel *filter,
+                       int x0_q4, int32_t x_step_q4, int y0_q4,
+                       int32_t y_step_q4, int32_t w, int32_t h) {
+  const int16_t *const filter_x = filter[x0_q4];
+  const int16_t *const filter_y = filter[y0_q4];
+  int8_t filt_hor[8], filt_ver[8];
+  int taps_x, taps_y, cnt;
+
+  assert(x_step_q4 == 16);
+  assert(y_step_q4 == 16);
+  /* Rejects a kernel whose taps [2], [3] read as the 32-bit value 0x800000
+   * (tap[2] == 0, tap[3] == 128 on a little-endian target): 128 cannot be
+   * represented in the int8_t copies made below. */
+  assert(((const int32_t *)filter_x)[1] != 0x800000);
+  assert(((const int32_t *)filter_y)[1] != 0x800000);
+
+  for (cnt = 0; cnt < 8; ++cnt) {
+    filt_hor[cnt] = (int8_t)filter_x[cnt];
+    filt_ver[cnt] = (int8_t)filter_y[cnt];
+  }
+
+  /* Query each kernel once; the results drive every branch below. */
+  taps_x = vpx_get_filter_taps(filter_x);
+  taps_y = vpx_get_filter_taps(filter_y);
+
+  if (taps_x == 2 && taps_y == 2) {
+    /* The 2-tap kernels read their tap pair starting at index 3. */
+    switch (w) {
+      case 4:
+        common_hv_2ht_2vt_4w_lsx(src, (int32_t)src_stride, dst,
+                                 (int32_t)dst_stride, &filt_hor[3],
+                                 &filt_ver[3], (int32_t)h);
+        break;
+      case 8:
+        common_hv_2ht_2vt_8w_lsx(src, (int32_t)src_stride, dst,
+                                 (int32_t)dst_stride, &filt_hor[3],
+                                 &filt_ver[3], (int32_t)h);
+        break;
+      case 16:
+        common_hv_2ht_2vt_16w_lsx(src, (int32_t)src_stride, dst,
+                                  (int32_t)dst_stride, &filt_hor[3],
+                                  &filt_ver[3], (int32_t)h);
+        break;
+      case 32:
+        common_hv_2ht_2vt_32w_lsx(src, (int32_t)src_stride, dst,
+                                  (int32_t)dst_stride, &filt_hor[3],
+                                  &filt_ver[3], (int32_t)h);
+        break;
+      case 64:
+        common_hv_2ht_2vt_64w_lsx(src, (int32_t)src_stride, dst,
+                                  (int32_t)dst_stride, &filt_hor[3],
+                                  &filt_ver[3], (int32_t)h);
+        break;
+      default:
+        vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                        x_step_q4, y0_q4, y_step_q4, w, h);
+        break;
+    }
+  } else if (taps_x == 2 || taps_y == 2) {
+    /* Mixed 2-tap / longer kernels have no LSX path. */
+    vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
+                    y0_q4, y_step_q4, w, h);
+  } else {
+    switch (w) {
+      case 4:
+        common_hv_8ht_8vt_4w_lsx(src, (int32_t)src_stride, dst,
+                                 (int32_t)dst_stride, filt_hor, filt_ver,
+                                 (int32_t)h);
+        break;
+      case 8:
+        common_hv_8ht_8vt_8w_lsx(src, (int32_t)src_stride, dst,
+                                 (int32_t)dst_stride, filt_hor, filt_ver,
+                                 (int32_t)h);
+        break;
+      case 16:
+        common_hv_8ht_8vt_16w_lsx(src, (int32_t)src_stride, dst,
+                                  (int32_t)dst_stride, filt_hor, filt_ver,
+                                  (int32_t)h);
+        break;
+      case 32:
+        common_hv_8ht_8vt_32w_lsx(src, (int32_t)src_stride, dst,
+                                  (int32_t)dst_stride, filt_hor, filt_ver,
+                                  (int32_t)h);
+        break;
+      case 64:
+        common_hv_8ht_8vt_64w_lsx(src, (int32_t)src_stride, dst,
+                                  (int32_t)dst_stride, filt_hor, filt_ver,
+                                  (int32_t)h);
+        break;
+      default:
+        vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                        x_step_q4, y0_q4, y_step_q4, w, h);
+        break;
+    }
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c
new file mode 100644
index 0000000000..6022e43c83
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c
@@ -0,0 +1,825 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/vpx_convolve_lsx.h"
+
+static void common_vt_8t_4w_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {  // 8-tap vertical filter for a 4-pixel-wide column; stores 4 * (height >> 2) rows of 4 bytes to dst.
+ uint32_t loop_cnt = height >> 2;  // one loop iteration per 4 output rows
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride + src_stride2;
+ int32_t src_stride4 = src_stride2 << 1;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+ __m128i reg0, reg1, reg2, reg3, reg4;
+ __m128i filter0, filter1, filter2, filter3;
+ __m128i out0, out1;
+ uint8_t *_src = (uint8_t *)src - src_stride3;  // reading starts 3 rows above src
+
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filter0, filter1, filter2, filter3);  // filterN = taps (2N, 2N+1) loaded as one 16-bit element and broadcast
+ src0 = __lsx_vld(_src, 0);  // prime the window with the first 7 source rows (src0..src6)
+ DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
+ src3 = __lsx_vldx(_src, src_stride3);
+ _src += src_stride4;
+ src4 = __lsx_vld(_src, 0);
+ DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
+ _src += src_stride3;
+ DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, tmp0,
+ tmp1, tmp2, tmp3);  // byte-interleave vertically adjacent rows so each tap pair meets its two pixels
+ DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, tmp4, tmp5);
+ DUP2_ARG2(__lsx_vilvl_d, tmp3, tmp0, tmp4, tmp1, reg0, reg1);  // pack two overlapping row pairs per vector, so one vector feeds two output rows
+ reg2 = __lsx_vilvl_d(tmp5, tmp2);
+ DUP2_ARG2(__lsx_vxori_b, reg0, 128, reg1, 128, reg0, reg1);  // XOR 128 flips each byte's top bit (unsigned pixel -> signed value); undone after filtering
+ reg2 = __lsx_vxori_b(reg2, 128);
+
+ for (; loop_cnt--;) {
+ src7 = __lsx_vld(_src, 0);  // fetch the next 4 source rows (src7..src10)
+ DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
+ src10 = __lsx_vldx(_src, src_stride3);
+ _src += src_stride4;
+ DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9,
+ tmp0, tmp1, tmp2, tmp3);
+ DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, reg3, reg4);
+ DUP2_ARG2(__lsx_vxori_b, reg3, 128, reg4, 128, reg3, reg4);
+ out0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, reg3, filter0, filter1,
+ filter2, filter3);  // helper defined outside this file; presumably the signed 8-tap sum per 16-bit lane -- confirm
+ out1 = filt_8tap_dpadd_s_h(reg1, reg2, reg3, reg4, filter0, filter1,
+ filter2, filter3);
+ out0 = __lsx_vssrarni_b_h(out1, out0, 7);  // shift right by 7 and narrow both 16-bit results into one byte vector
+ out0 = __lsx_vxori_b(out0, 128);  // undo the XOR-128 applied to the inputs
+ __lsx_vstelm_w(out0, dst, 0, 0);  // each 32-bit element of out0 is one 4-pixel output row
+ dst += dst_stride;
+ __lsx_vstelm_w(out0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_w(out0, dst, 0, 2);
+ dst += dst_stride;
+ __lsx_vstelm_w(out0, dst, 0, 3);
+ dst += dst_stride;
+
+ reg0 = reg2;  // slide the row window down by 4 rows for the next iteration
+ reg1 = reg3;
+ reg2 = reg4;
+ src6 = src10;  // last row loaded becomes the row preceding the next batch
+ }
+}
+
+static void common_vt_8t_8w_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {  // 8-tap vertical filter for an 8-pixel-wide column; stores 4 * (height >> 2) rows of 8 bytes to dst.
+ uint32_t loop_cnt = height >> 2;  // one loop iteration per 4 output rows
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+ __m128i reg0, reg1, reg2, reg3, reg4, reg5;
+ __m128i filter0, filter1, filter2, filter3;
+ __m128i out0, out1, out2, out3;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride + src_stride2;
+ int32_t src_stride4 = src_stride2 << 1;
+ src = src - src_stride3;  // reading starts 3 rows above the first output row
+
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filter0, filter1, filter2, filter3);  // filterN = taps (2N, 2N+1) loaded as one 16-bit element and broadcast
+
+ src0 = __lsx_vld(src, 0);  // prime the window with the first 7 source rows (src0..src6)
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+ src3 = __lsx_vldx(src, src_stride3);
+ src += src_stride4;
+ src4 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6);
+ src += src_stride3;
+
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);  // XOR 128 flips each byte's top bit (unsigned pixel -> signed value); undone after filtering
+ DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
+ src6 = __lsx_vxori_b(src6, 128);
+ DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, reg0,
+ reg1, reg2, reg3);  // reg0..reg2 = row pairs (0,1),(2,3),(4,5); reg3..reg5 = row pairs (1,2),(3,4),(5,6)
+ DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5);
+
+ for (; loop_cnt--;) {
+ src7 = __lsx_vld(src, 0);  // fetch the next 4 source rows (src7..src10)
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src8, src9);
+ src10 = __lsx_vldx(src, src_stride3);
+ src += src_stride4;
+ DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7,
+ src8, src9, src10);
+ DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9,
+ tmp0, tmp1, tmp2, tmp3);  // tmp0..tmp3 = row pairs (6,7),(7,8),(8,9),(9,10)
+ out0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, tmp0, filter0, filter1,
+ filter2, filter3);  // helper defined outside this file; presumably the signed 8-tap sum per 16-bit lane -- confirm
+ out1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, tmp1, filter0, filter1,
+ filter2, filter3);
+ out2 = filt_8tap_dpadd_s_h(reg1, reg2, tmp0, tmp2, filter0, filter1,
+ filter2, filter3);
+ out3 = filt_8tap_dpadd_s_h(reg4, reg5, tmp1, tmp3, filter0, filter1,
+ filter2, filter3);
+ DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);  // shift right by 7 and narrow: out0 = output rows 0,1; out1 = output rows 2,3
+ DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);  // undo the XOR-128 applied to the inputs
+ __lsx_vstelm_d(out0, dst, 0, 0);  // each 64-bit element is one 8-pixel output row
+ dst += dst_stride;
+ __lsx_vstelm_d(out0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_d(out1, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(out1, dst, 0, 1);
+ dst += dst_stride;
+
+ reg0 = reg2;  // slide both row-pair chains down by 4 rows for the next iteration
+ reg1 = tmp0;
+ reg2 = tmp2;
+ reg3 = reg5;
+ reg4 = tmp1;
+ reg5 = tmp3;
+ src6 = src10;  // last row loaded becomes the row preceding the next batch
+ }
+}
+
+static void common_vt_8t_16w_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {  // 8-tap vertical filter for a 16-pixel-wide column; stores 4 * (height >> 2) full 16-byte rows to dst.
+ uint32_t loop_cnt = height >> 2;  // one loop iteration per 4 output rows
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ __m128i filter0, filter1, filter2, filter3;
+ __m128i reg0, reg1, reg2, reg3, reg4, reg5;
+ __m128i reg6, reg7, reg8, reg9, reg10, reg11;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride + src_stride2;
+ int32_t src_stride4 = src_stride2 << 1;
+ // Reading starts 3 rows above the first output row.
+ src -= src_stride3;
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filter0, filter1, filter2, filter3);  // filterN = taps (2N, 2N+1) loaded as one 16-bit element and broadcast
+
+ src0 = __lsx_vld(src, 0);  // prime the window with the first 7 source rows (src0..src6)
+ DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+ src, src_stride4, src1, src2, src3, src4);
+ src += src_stride4;
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6);
+ src += src_stride3;
+
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);  // XOR 128 flips each byte's top bit (unsigned pixel -> signed value); undone after filtering
+ DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
+ src6 = __lsx_vxori_b(src6, 128);
+ DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, reg0,
+ reg1, reg2, reg3);  // reg0..reg5: row-pair interleaves built from the low 8 bytes of each row
+ DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5);
+ DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1, reg6,
+ reg7, reg8, reg9);  // reg6..reg11: the same row pairs built from the high 8 bytes
+ DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, reg10, reg11);
+
+ for (; loop_cnt--;) {
+ src7 = __lsx_vld(src, 0);  // fetch the next 4 source rows (src7..src10)
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src8, src9);
+ src10 = __lsx_vldx(src, src_stride3);
+ src += src_stride4;
+
+ DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7,
+ src8, src9, src10);
+ DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9,
+ src0, src1, src2, src3);  // src0..src3 are recycled to hold the new low-half row pairs
+ DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10, src9,
+ src4, src5, src7, src8);  // src7/src8 are both inputs and outputs here; NOTE(review): relies on the macro assigning outputs in argument order -- confirm
+ tmp0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, src0, filter0, filter1,
+ filter2, filter3);  // helper defined outside this file; presumably the signed 8-tap sum per 16-bit lane -- confirm
+ tmp1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, src1, filter0, filter1,
+ filter2, filter3);
+ tmp2 = filt_8tap_dpadd_s_h(reg6, reg7, reg8, src4, filter0, filter1,
+ filter2, filter3);
+ tmp3 = filt_8tap_dpadd_s_h(reg9, reg10, reg11, src5, filter0, filter1,
+ filter2, filter3);
+ DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);  // shift right by 7 and merge low/high halves: tmp0 = output row 0, tmp1 = output row 1
+ DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);  // undo the XOR-128 applied to the inputs
+ __lsx_vst(tmp0, dst, 0);
+ dst += dst_stride;
+ __lsx_vst(tmp1, dst, 0);
+ dst += dst_stride;
+ tmp0 = filt_8tap_dpadd_s_h(reg1, reg2, src0, src2, filter0, filter1,
+ filter2, filter3);  // same four filters shifted down two rows: output rows 2 and 3
+ tmp1 = filt_8tap_dpadd_s_h(reg4, reg5, src1, src3, filter0, filter1,
+ filter2, filter3);
+ tmp2 = filt_8tap_dpadd_s_h(reg7, reg8, src4, src7, filter0, filter1,
+ filter2, filter3);
+ tmp3 = filt_8tap_dpadd_s_h(reg10, reg11, src5, src8, filter0, filter1,
+ filter2, filter3);
+ DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
+ DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+ __lsx_vst(tmp0, dst, 0);
+ dst += dst_stride;
+ __lsx_vst(tmp1, dst, 0);
+ dst += dst_stride;
+
+ reg0 = reg2;  // slide all four row-pair chains down by 4 rows for the next iteration
+ reg1 = src0;
+ reg2 = src2;
+ reg3 = reg5;
+ reg4 = src1;
+ reg5 = src3;
+ reg6 = reg8;
+ reg7 = src4;
+ reg8 = src7;
+ reg9 = reg11;
+ reg10 = src5;
+ reg11 = src8;
+ src6 = src10;  // last row loaded becomes the row preceding the next batch
+ }
+}
+
+static void common_vt_8t_16w_mult_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height,
+ int32_t width) {  // 8-tap vertical filter for blocks wider than 16: processes (width >> 4) strips of 16 columns, 4 * (height >> 2) rows each.
+ uint8_t *src_tmp;
+ uint8_t *dst_tmp;
+ uint32_t cnt = width >> 4;  // number of 16-column strips
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ __m128i filter0, filter1, filter2, filter3;
+ __m128i reg0, reg1, reg2, reg3, reg4, reg5;
+ __m128i reg6, reg7, reg8, reg9, reg10, reg11;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride + src_stride2;
+ int32_t src_stride4 = src_stride2 << 1;
+ int32_t dst_stride2 = dst_stride << 1;
+ int32_t dst_stride3 = dst_stride2 + dst_stride;
+ int32_t dst_stride4 = dst_stride2 << 1;
+ src -= src_stride3;  // reading starts 3 rows above the first output row
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filter0, filter1, filter2, filter3);  // filterN = taps (2N, 2N+1) loaded as one 16-bit element and broadcast
+
+ for (; cnt--;) {
+ uint32_t loop_cnt = height >> 2;  // one inner iteration per 4 output rows
+
+ src_tmp = src;  // NOTE(review): drops the const qualifier of src without a cast
+ dst_tmp = dst;
+
+ src0 = __lsx_vld(src_tmp, 0);  // prime the window with the first 7 source rows of this strip
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src1,
+ src2);
+ src3 = __lsx_vldx(src_tmp, src_stride3);
+ src_tmp += src_stride4;
+ src4 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5,
+ src6);
+ src_tmp += src_stride3;
+
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);  // XOR 128 flips each byte's top bit (unsigned pixel -> signed value); undone after filtering
+ DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
+ src6 = __lsx_vxori_b(src6, 128);
+ DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
+ reg0, reg1, reg2, reg3);  // reg0..reg5: row-pair interleaves built from the low 8 bytes of each row
+ DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5);
+ DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1,
+ reg6, reg7, reg8, reg9);  // reg6..reg11: the same row pairs built from the high 8 bytes
+ DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, reg10, reg11);
+
+ for (; loop_cnt--;) {
+ src7 = __lsx_vld(src_tmp, 0);  // fetch the next 4 source rows (src7..src10)
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src8,
+ src9);
+ src10 = __lsx_vldx(src_tmp, src_stride3);
+ src_tmp += src_stride4;
+ DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128,
+ src7, src8, src9, src10);
+ DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9,
+ src0, src1, src2, src3);  // src0..src3 are recycled to hold the new low-half row pairs
+ DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10, src9,
+ src4, src5, src7, src8);  // src7/src8 are both inputs and outputs here; NOTE(review): relies on the macro assigning outputs in argument order -- confirm
+ tmp0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, src0, filter0, filter1,
+ filter2, filter3);  // helper defined outside this file; presumably the signed 8-tap sum per 16-bit lane -- confirm
+ tmp1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, src1, filter0, filter1,
+ filter2, filter3);
+ tmp2 = filt_8tap_dpadd_s_h(reg6, reg7, reg8, src4, filter0, filter1,
+ filter2, filter3);
+ tmp3 = filt_8tap_dpadd_s_h(reg9, reg10, reg11, src5, filter0, filter1,
+ filter2, filter3);
+ DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);  // shift right by 7 and merge low/high halves: tmp0 = output row 0, tmp1 = output row 1
+ DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);  // undo the XOR-128 applied to the inputs
+ __lsx_vst(tmp0, dst_tmp, 0);
+ __lsx_vstx(tmp1, dst_tmp, dst_stride);
+ tmp0 = filt_8tap_dpadd_s_h(reg1, reg2, src0, src2, filter0, filter1,
+ filter2, filter3);  // same four filters shifted down two rows: output rows 2 and 3
+ tmp1 = filt_8tap_dpadd_s_h(reg4, reg5, src1, src3, filter0, filter1,
+ filter2, filter3);
+ tmp2 = filt_8tap_dpadd_s_h(reg7, reg8, src4, src7, filter0, filter1,
+ filter2, filter3);
+ tmp3 = filt_8tap_dpadd_s_h(reg10, reg11, src5, src8, filter0, filter1,
+ filter2, filter3);
+ DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
+ DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+ __lsx_vstx(tmp0, dst_tmp, dst_stride2);
+ __lsx_vstx(tmp1, dst_tmp, dst_stride3);
+ dst_tmp += dst_stride4;
+
+ reg0 = reg2;  // slide all four row-pair chains down by 4 rows for the next iteration
+ reg1 = src0;
+ reg2 = src2;
+ reg3 = reg5;
+ reg4 = src1;
+ reg5 = src3;
+ reg6 = reg8;
+ reg7 = src4;
+ reg8 = src7;
+ reg9 = reg11;
+ reg10 = src5;
+ reg11 = src8;
+ src6 = src10;  // last row loaded becomes the row preceding the next batch
+ }
+ src += 16;  // advance to the next 16-column strip
+ dst += 16;
+ }
+}
+
+// 8-tap vertical filter for 32-pixel-wide blocks; forwards to the 16-column
+// strip kernel with a fixed block width of 32.
+static void common_vt_8t_32w_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  const int32_t block_width = 32;
+
+  common_vt_8t_16w_mult_lsx(src, src_stride, dst, dst_stride, filter, height,
+                            block_width);
+}
+
+// 8-tap vertical filter for 64-pixel-wide blocks; forwards to the 16-column
+// strip kernel with a fixed block width of 64.
+static void common_vt_8t_64w_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  const int32_t block_width = 64;
+
+  common_vt_8t_16w_mult_lsx(src, src_stride, dst, dst_stride, filter, height,
+                            block_width);
+}
+
+// 2-tap vertical filter for a 4x4 block.
+//
+// src, src_stride: source pixels and row pitch; rows 0..4 are read.
+// dst, dst_stride: destination and row pitch; four rows of 4 bytes are stored.
+// filter: points at the two taps to apply; both are loaded together as one
+//         16-bit element and broadcast across the vector.
+static void common_vt_2t_4x4_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter) {
+  __m128i src0, src1, src2, src3, src4;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5;
+  __m128i filt0, tmp0, tmp1;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  // Five source rows feed four outputs: output row r combines rows r and r+1.
+  // (The source pointer is not advanced afterwards; nothing reads it again.)
+  src0 = __lsx_vld(src, 0);
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            src, src_stride4, src1, src2, src3, src4);
+
+  // Interleave vertically adjacent rows byte by byte, then pack two row pairs
+  // into each vector so two dot products cover all four output rows.
+  DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, vec0,
+            vec1, vec2, vec3);
+  DUP2_ARG2(__lsx_vilvl_d, vec1, vec0, vec3, vec2, vec4, vec5);
+  DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+  // Scale back by FILTER_BITS and narrow the 16-bit sums to bytes.
+  tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+  // Each 32-bit element of tmp0 is one 4-pixel output row.
+  __lsx_vstelm_w(tmp0, dst, 0, 0);
+  __lsx_vstelm_w(tmp0, dst + dst_stride, 0, 1);
+  __lsx_vstelm_w(tmp0, dst + dst_stride2, 0, 2);
+  __lsx_vstelm_w(tmp0, dst + dst_stride3, 0, 3);
+}
+
+// 2-tap vertical filter for a 4x8 block.
+//
+// src, src_stride: source pixels and row pitch; rows 0..8 are read.
+// dst, dst_stride: destination and row pitch; eight rows of 4 bytes are
+//                  stored.
+// filter: points at the two taps to apply; both are loaded together as one
+//         16-bit element and broadcast across the vector.
+static void common_vt_2t_4x8_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter) {
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5;
+  __m128i vec6, vec7, vec8, vec9, vec10, vec11;
+  __m128i tmp0, tmp1, tmp2, tmp3;
+  __m128i filt0;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+  int32_t dst_stride4 = dst_stride2 << 1;
+  uint8_t *dst_tmp1 = dst + dst_stride4;  // rows 4..7 of the destination
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  // Nine source rows feed eight outputs: output row r combines rows r and
+  // r+1. (The source pointer is not advanced after the last load; nothing
+  // reads it again.)
+  src0 = __lsx_vld(src, 0);
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            src, src_stride4, src1, src2, src3, src4);
+  src += src_stride4;
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            src, src_stride4, src5, src6, src7, src8);
+
+  // Interleave vertically adjacent rows byte by byte, then pack two row pairs
+  // into each vector so four dot products cover all eight output rows.
+  DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, vec0,
+            vec1, vec2, vec3);
+  DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7, vec4,
+            vec5, vec6, vec7);
+  DUP4_ARG2(__lsx_vilvl_d, vec1, vec0, vec3, vec2, vec5, vec4, vec7, vec6, vec8,
+            vec9, vec10, vec11);
+
+  DUP4_ARG2(__lsx_vdp2_h_bu, vec8, filt0, vec9, filt0, vec10, filt0, vec11,
+            filt0, tmp0, tmp1, tmp2, tmp3);
+  // Scale back by FILTER_BITS and narrow the 16-bit sums to bytes:
+  // tmp0 = output rows 0..3, tmp1 = output rows 4..7.
+  DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+            FILTER_BITS, tmp0, tmp1);
+
+  // Each 32-bit element is one 4-pixel output row.
+  __lsx_vstelm_w(tmp0, dst, 0, 0);
+  __lsx_vstelm_w(tmp0, dst + dst_stride, 0, 1);
+  __lsx_vstelm_w(tmp0, dst + dst_stride2, 0, 2);
+  __lsx_vstelm_w(tmp0, dst + dst_stride3, 0, 3);
+
+  __lsx_vstelm_w(tmp1, dst_tmp1, 0, 0);
+  __lsx_vstelm_w(tmp1, dst_tmp1 + dst_stride, 0, 1);
+  __lsx_vstelm_w(tmp1, dst_tmp1 + dst_stride2, 0, 2);
+  __lsx_vstelm_w(tmp1, dst_tmp1 + dst_stride3, 0, 3);
+}
+
+// 2-tap vertical filter for 4-pixel-wide blocks. Only heights 4 and 8 have a
+// kernel; any other height writes nothing.
+static void common_vt_2t_4w_lsx(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                int8_t *filter, int32_t height) {
+  switch (height) {
+    case 4:
+      common_vt_2t_4x4_lsx(src, src_stride, dst, dst_stride, filter);
+      break;
+    case 8:
+      common_vt_2t_4x8_lsx(src, src_stride, dst, dst_stride, filter);
+      break;
+    default:
+      // No kernel for this height: leave dst untouched.
+      break;
+  }
+}
+
+static void common_vt_2t_8x4_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {  // 2-tap vertical filter for an 8x4 block: reads source rows 0..4, stores four rows of 8 bytes to dst.
+ __m128i src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0;
+ __m128i out0, out1, tmp0, tmp1, tmp2, tmp3;
+
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+ int32_t dst_stride2 = dst_stride << 1;
+ int32_t dst_stride3 = dst_stride2 + dst_stride;
+
+ filt0 = __lsx_vldrepl_h(filter, 0);  // both taps loaded as one 16-bit element and broadcast
+
+ src0 = __lsx_vld(src, 0);  // five source rows feed four outputs (output r uses rows r and r+1)
+ DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+ src, src_stride4, src1, src2, src3, src4);
+
+ DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, vec0,
+ vec1, vec2, vec3);  // byte-interleave vertically adjacent rows (low 8 bytes of each)
+ DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
+ tmp0, tmp1, tmp2, tmp3);  // per-pixel two-tap sum as a 16-bit value
+ DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+ FILTER_BITS, out0, out1);  // shift right by FILTER_BITS and narrow to bytes: out0 = rows 0,1; out1 = rows 2,3
+
+ __lsx_vstelm_d(out0, dst, 0, 0);  // each 64-bit element is one 8-pixel output row
+ __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
+ __lsx_vstelm_d(out1, dst + dst_stride2, 0, 0);
+ __lsx_vstelm_d(out1, dst + dst_stride3, 0, 1);
+}
+
+// 2-tap vertical filter for 8-pixel-wide blocks, processed 8 rows at a time.
+//
+// src, src_stride: source pixels and row pitch; 8 * (height >> 3) + 1 rows are
+//                  read.
+// dst, dst_stride: destination and row pitch; 8 * (height >> 3) rows of 8
+//                  bytes are stored.
+// filter: points at the two taps to apply; both are loaded together as one
+//         16-bit element and broadcast across the vector.
+// height: number of rows; only whole groups of 8 are processed.
+static void common_vt_2t_8x8mult_lsx(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = (height >> 3);
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+  __m128i out0, out1, tmp0, tmp1, tmp2, tmp3;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+  int32_t dst_stride4 = dst_stride2 << 1;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+  // src0 always holds the row just above the batch being filtered.
+  src0 = __lsx_vld(src, 0);
+  src += src_stride;
+
+  for (; loop_cnt--;) {
+    // Load the next eight source rows.
+    src1 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
+    src4 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+    src5 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src6, src7);
+    src8 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+
+    // Byte-interleave vertically adjacent rows: vecN pairs row N with N + 1.
+    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3,
+              vec0, vec1, vec2, vec3);
+    DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7,
+              vec4, vec5, vec6, vec7);
+
+    // Output rows 0..3 of this batch.
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+              filt0, tmp0, tmp1, tmp2, tmp3);
+    DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+              FILTER_BITS, out0, out1);
+
+    __lsx_vstelm_d(out0, dst, 0, 0);
+    __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
+    __lsx_vstelm_d(out1, dst + dst_stride2, 0, 0);
+    __lsx_vstelm_d(out1, dst + dst_stride3, 0, 1);
+    dst += dst_stride4;
+
+    // Output rows 4..7 of this batch.
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
+              filt0, tmp0, tmp1, tmp2, tmp3);
+    DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+              FILTER_BITS, out0, out1);
+
+    __lsx_vstelm_d(out0, dst, 0, 0);
+    __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
+    __lsx_vstelm_d(out1, dst + dst_stride2, 0, 0);
+    __lsx_vstelm_d(out1, dst + dst_stride3, 0, 1);
+    dst += dst_stride4;
+
+    // The last row loaded becomes the row above the next batch.
+    src0 = src8;
+  }
+}
+
+// 2-tap vertical filter for 8-pixel-wide blocks: height 4 has a dedicated
+// kernel, every other height goes to the 8-rows-per-iteration kernel.
+static void common_vt_2t_8w_lsx(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                int8_t *filter, int32_t height) {
+  if (height != 4) {
+    common_vt_2t_8x8mult_lsx(src, src_stride, dst, dst_stride, filter, height);
+    return;
+  }
+
+  common_vt_2t_8x4_lsx(src, src_stride, dst, dst_stride, filter);
+}
+
+static void common_vt_2t_16w_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {  // 2-tap vertical filter for a 16-pixel-wide column; stores 4 * (height >> 2) full 16-byte rows to dst.
+ uint32_t loop_cnt = (height >> 2);  // one loop iteration per 4 output rows
+ __m128i src0, src1, src2, src3, src4, tmp, tmp0, tmp1;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+
+ filt0 = __lsx_vldrepl_h(filter, 0);  // both taps loaded as one 16-bit element and broadcast
+
+ src0 = __lsx_vld(src, 0);  // src0 always holds the row just above the current batch
+ src += src_stride;
+
+ for (; loop_cnt--;) {
+ src1 = __lsx_vld(src, 0);  // load the next 4 source rows
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
+ src4 = __lsx_vldx(src, src_stride3);
+ src += src_stride4;
+
+ DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2);  // low 8 columns: row pairs (0,1) and (1,2)
+ DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3);  // high 8 columns of the same row pairs
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);  // per-pixel two-tap sum as a 16-bit value
+ tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);  // shift right by FILTER_BITS and narrow both halves into one 16-byte row
+ __lsx_vst(tmp, dst, 0);
+ dst += dst_stride;
+
+ DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6);  // row pairs (2,3) and (3,4), prepared before row 1 is stored
+ DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+ tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ __lsx_vst(tmp, dst, 0);
+ dst += dst_stride;
+
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+ tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ __lsx_vst(tmp, dst, 0);
+ dst += dst_stride;
+
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+ tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ __lsx_vst(tmp, dst, 0);
+ dst += dst_stride;
+
+ src0 = src4;  // last row loaded becomes the row above the next batch
+ }
+}
+
+static void common_vt_2t_32w_lsx(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {  // 2-tap vertical filter for a 32-pixel-wide column; stores 4 * (height >> 2) rows of 32 bytes to dst.
+ uint32_t loop_cnt = (height >> 2);  // one loop iteration per 4 output rows
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+ __m128i tmp, tmp0, tmp1;
+
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride3 = src_stride2 + src_stride;
+ int32_t src_stride4 = src_stride2 << 1;
+ int32_t dst_stride2 = dst_stride << 1;
+ int32_t dst_stride3 = dst_stride2 + dst_stride;
+ uint8_t *src_tmp;
+
+ filt0 = __lsx_vldrepl_h(filter, 0);  // both taps loaded as one 16-bit element and broadcast
+
+ DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src5);  // row above the first batch: src0 = columns 0..15, src5 = columns 16..31
+ src += src_stride;
+ src_tmp = src + 16;  // second 16-column half; NOTE(review): drops the const qualifier of src without a cast
+
+ for (; loop_cnt--;) {
+ DUP2_ARG2(__lsx_vld, src, 0, src_tmp, 0, src1, src6);  // next 4 rows: src1..src4 = left half, src6..src9 = right half
+ DUP4_ARG2(__lsx_vldx, src, src_stride, src_tmp, src_stride, src,
+ src_stride2, src_tmp, src_stride2, src2, src7, src3, src8);
+ DUP2_ARG2(__lsx_vldx, src, src_stride3, src_tmp, src_stride3, src4, src9);
+ DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2);  // byte-interleave vertically adjacent rows (low 8 columns)
+ DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3);  // same row pairs, high 8 columns
+ src += src_stride4;
+ src_tmp += src_stride4;
+
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);  // per-pixel two-tap sum as a 16-bit value
+ tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);  // shift right by FILTER_BITS and narrow both halves into 16 bytes
+ __lsx_vst(tmp, dst, 0);  // left half of output rows 0..3 is written at offsets from an unmoved dst
+
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+ tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ __lsx_vstx(tmp, dst, dst_stride);
+
+ DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6);
+ DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+ tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ __lsx_vstx(tmp, dst, dst_stride2);
+
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+ tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ __lsx_vstx(tmp, dst, dst_stride3);
+
+ DUP2_ARG2(__lsx_vilvl_b, src6, src5, src7, src6, vec0, vec2);  // right half: same steps on columns 16..31
+ DUP2_ARG2(__lsx_vilvh_b, src6, src5, src7, src6, vec1, vec3);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+ tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ __lsx_vst(tmp, dst, 16);  // right half is written at byte offset 16, advancing dst one row before each later store
+
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+ tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ dst += dst_stride;
+ __lsx_vst(tmp, dst, 16);
+
+ DUP2_ARG2(__lsx_vilvl_b, src8, src7, src9, src8, vec4, vec6);
+ DUP2_ARG2(__lsx_vilvh_b, src8, src7, src9, src8, vec5, vec7);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+ tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ dst += dst_stride;
+ __lsx_vst(tmp, dst, 16);
+
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+ tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+ dst += dst_stride;
+ __lsx_vst(tmp, dst, 16);
+
+ dst += dst_stride;  // fourth bump: dst has now advanced 4 rows in total
+
+ src0 = src4;  // last row loaded (both halves) becomes the row above the next batch
+ src5 = src9;
+ }
+}
+
+// 2-tap vertical filter for 64-pixel-wide blocks, processed 2 rows at a time.
+//
+// src, src_stride: source pixels and row pitch; 2 * (height >> 1) + 1 rows of
+//                  64 bytes are read.
+// dst, dst_stride: destination and row pitch; 2 * (height >> 1) rows of 64
+//                  bytes are stored.
+// filter: points at the two taps to apply; both are loaded together as one
+//         16-bit element and broadcast across the vector.
+// height: number of rows; only whole pairs of rows are processed.
+static void common_vt_2t_64w_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = (height >> 1);
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  __m128i src11, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+  __m128i tmp, tmp0, tmp1;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t dst_stride2 = dst_stride << 1;
+  uint8_t *dst_tmp1 = dst + dst_stride;  // second output row of each pair
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  // Row above the first pair, split into four 16-column quarters:
+  // src0, src3, src6, src9.
+  DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src3, src6,
+            src9);
+  src += src_stride;
+
+  for (; loop_cnt--;) {
+    // Read-only view of the second new row; kept const like src itself.
+    const uint8_t *src_tmp0 = src + src_stride;
+
+    // Two new source rows, each as four quarters.
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src1, src4, src7,
+              src10);
+    DUP4_ARG2(__lsx_vld, src_tmp0, 0, src_tmp0, 16, src_tmp0, 32, src_tmp0, 48,
+              src2, src5, src8, src11);
+    src += src_stride2;
+
+    // Columns 0..15.
+    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2);
+    DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst, 0);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst_tmp1, 0);
+
+    // Columns 16..31.
+    DUP2_ARG2(__lsx_vilvl_b, src4, src3, src5, src4, vec4, vec6);
+    DUP2_ARG2(__lsx_vilvh_b, src4, src3, src5, src4, vec5, vec7);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst, 16);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst_tmp1, 16);
+
+    // Columns 32..47.
+    DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, vec0, vec2);
+    DUP2_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, vec1, vec3);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst, 32);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst_tmp1, 32);
+
+    // Columns 48..63.
+    DUP2_ARG2(__lsx_vilvl_b, src10, src9, src11, src10, vec4, vec6);
+    DUP2_ARG2(__lsx_vilvh_b, src10, src9, src11, src10, vec5, vec7);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst, 48);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst_tmp1, 48);
+    dst += dst_stride2;
+    dst_tmp1 += dst_stride2;
+
+    // The second new row becomes the row above the next pair.
+    src0 = src2;
+    src3 = src5;
+    src6 = src8;
+    src9 = src11;
+  }
+}
+
+// Vertical convolution entry point for LSX.
+//
+// Picks the kernel row filter[y0_q4], narrows its eight 16-bit taps to 8 bits
+// and dispatches on block width. Two-tap kernels are handed a pointer to the
+// fourth narrowed tap (index 3); all other kernels get the full eight taps.
+// Widths other than 4/8/16/32/64 fall back to vpx_convolve8_vert_c().
+void vpx_convolve8_vert_lsx(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride,
+                            const InterpKernel *filter, int x0_q4,
+                            int32_t x_step_q4, int y0_q4, int y_step_q4, int w,
+                            int h) {
+  const int16_t *const filter_y = filter[y0_q4];
+  const int32_t s_stride = (int32_t)src_stride;
+  const int32_t d_stride = (int32_t)dst_stride;
+  int8_t filt_ver[8];
+  int8_t *two_tap;
+  int i;
+
+  assert(y_step_q4 == 16);
+  assert(((const int32_t *)filter_y)[1] != 0x800000);
+
+  // Narrow the selected kernel to 8-bit taps.
+  for (i = 0; i < 8; ++i) {
+    filt_ver[i] = filter_y[i];
+  }
+
+  if (vpx_get_filter_taps(filter_y) != 2) {
+    switch (w) {
+      case 4:
+        common_vt_8t_4w_lsx(src, s_stride, dst, d_stride, filt_ver, h);
+        break;
+      case 8:
+        common_vt_8t_8w_lsx(src, s_stride, dst, d_stride, filt_ver, h);
+        break;
+      case 16:
+        common_vt_8t_16w_lsx(src, s_stride, dst, d_stride, filt_ver, h);
+        break;
+      case 32:
+        common_vt_8t_32w_lsx(src, s_stride, dst, d_stride, filt_ver, h);
+        break;
+      case 64:
+        common_vt_8t_64w_lsx(src, s_stride, dst, d_stride, filt_ver, h);
+        break;
+      default:
+        vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                             x_step_q4, y0_q4, y_step_q4, w, h);
+        break;
+    }
+    return;
+  }
+
+  // Two-tap kernels read their coefficient pair starting at index 3.
+  two_tap = &filt_ver[3];
+  switch (w) {
+    case 4:
+      common_vt_2t_4w_lsx(src, s_stride, dst, d_stride, two_tap, h);
+      break;
+    case 8:
+      common_vt_2t_8w_lsx(src, s_stride, dst, d_stride, two_tap, h);
+      break;
+    case 16:
+      common_vt_2t_16w_lsx(src, s_stride, dst, d_stride, two_tap, h);
+      break;
+    case 32:
+      common_vt_2t_32w_lsx(src, s_stride, dst, d_stride, two_tap, h);
+      break;
+    case 64:
+      common_vt_2t_64w_lsx(src, s_stride, dst, d_stride, two_tap, h);
+      break;
+    default:
+      vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                           x_step_q4, y0_q4, y_step_q4, w, h);
+      break;
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_avg_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_avg_lsx.c
new file mode 100644
index 0000000000..1dad29eeed
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_avg_lsx.c
@@ -0,0 +1,321 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_util/loongson_intrinsics.h"
+
+/*
+ * Rounded average of a 4-pixel-wide block of src into dst, two rows per
+ * iteration. Nothing is written when height is odd.
+ */
+static void avg_width4_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst,
+                           int32_t dst_stride, int32_t height) {
+  const int32_t src_step = src_stride << 1;
+  const int32_t dst_step = dst_stride << 1;
+  int32_t row_pairs;
+
+  if ((height % 2) != 0) {
+    return;
+  }
+
+  row_pairs = height / 2;
+  while (row_pairs--) {
+    /* Full 16-byte loads; only the low 32 bits of each result are stored. */
+    const __m128i s_row0 = __lsx_vld(src, 0);
+    const __m128i s_row1 = __lsx_vldx(src, src_stride);
+    const __m128i d_row0 = __lsx_vld(dst, 0);
+    const __m128i d_row1 = __lsx_vldx(dst, dst_stride);
+    const __m128i avg_row0 = __lsx_vavgr_bu(s_row0, d_row0);
+    const __m128i avg_row1 = __lsx_vavgr_bu(s_row1, d_row1);
+
+    __lsx_vstelm_w(avg_row0, dst, 0, 0);
+    __lsx_vstelm_w(avg_row1, dst + dst_stride, 0, 0);
+
+    src += src_step;
+    dst += dst_step;
+  }
+}
+
+/*
+ * Rounded average of an 8-pixel-wide block of src into dst, four rows per
+ * iteration (height / 4 iterations; leftover rows are not processed).
+ */
+static void avg_width8_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst,
+                           int32_t dst_stride, int32_t height) {
+  const int32_t src_stride2 = src_stride << 1;
+  const int32_t src_stride3 = src_stride2 + src_stride;
+  const int32_t dst_stride2 = dst_stride << 1;
+  const int32_t dst_stride3 = dst_stride2 + dst_stride;
+  int32_t groups = height / 4;
+
+  while (groups--) {
+    /* Each load fetches 16 bytes; only the low 64 bits are written back. */
+    const __m128i s0 = __lsx_vld(src, 0);
+    const __m128i s1 = __lsx_vldx(src, src_stride);
+    const __m128i s2 = __lsx_vldx(src, src_stride2);
+    const __m128i s3 = __lsx_vldx(src, src_stride3);
+    const __m128i d0 = __lsx_vld(dst, 0);
+    const __m128i d1 = __lsx_vldx(dst, dst_stride);
+    const __m128i d2 = __lsx_vldx(dst, dst_stride2);
+    const __m128i d3 = __lsx_vldx(dst, dst_stride3);
+    const __m128i avg0 = __lsx_vavgr_bu(s0, d0);
+    const __m128i avg1 = __lsx_vavgr_bu(s1, d1);
+    const __m128i avg2 = __lsx_vavgr_bu(s2, d2);
+    const __m128i avg3 = __lsx_vavgr_bu(s3, d3);
+
+    __lsx_vstelm_d(avg0, dst, 0, 0);
+    __lsx_vstelm_d(avg1, dst + dst_stride, 0, 0);
+    __lsx_vstelm_d(avg2, dst + dst_stride2, 0, 0);
+    __lsx_vstelm_d(avg3, dst + dst_stride3, 0, 0);
+
+    src += src_stride2 << 1;
+    dst += dst_stride2 << 1;
+  }
+}
+
+/*
+ * Rounded average of a 16-pixel-wide block of src into dst, eight rows per
+ * iteration (height / 8 iterations; leftover rows are not processed).
+ */
+static void avg_width16_lsx(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride, int32_t height) {
+  const int32_t src_stride2 = src_stride << 1;
+  const int32_t src_stride3 = src_stride2 + src_stride;
+  const int32_t src_stride4 = src_stride2 << 1;
+  const int32_t dst_stride2 = dst_stride << 1;
+  const int32_t dst_stride3 = dst_stride2 + dst_stride;
+  const int32_t dst_stride4 = dst_stride2 << 1;
+  int32_t groups = height / 8;
+
+  while (groups--) {
+    /* Rows 4..7 of the current group. */
+    const uint8_t *const src_hi = src + src_stride4;
+    uint8_t *const dst_hi = dst + dst_stride4;
+
+    /* All sixteen loads are issued before the first store. */
+    const __m128i s0 = __lsx_vld(src, 0);
+    const __m128i s1 = __lsx_vldx(src, src_stride);
+    const __m128i s2 = __lsx_vldx(src, src_stride2);
+    const __m128i s3 = __lsx_vldx(src, src_stride3);
+    const __m128i s4 = __lsx_vld(src_hi, 0);
+    const __m128i s5 = __lsx_vldx(src_hi, src_stride);
+    const __m128i s6 = __lsx_vldx(src_hi, src_stride2);
+    const __m128i s7 = __lsx_vldx(src_hi, src_stride3);
+
+    const __m128i d0 = __lsx_vld(dst, 0);
+    const __m128i d1 = __lsx_vldx(dst, dst_stride);
+    const __m128i d2 = __lsx_vldx(dst, dst_stride2);
+    const __m128i d3 = __lsx_vldx(dst, dst_stride3);
+    const __m128i d4 = __lsx_vld(dst_hi, 0);
+    const __m128i d5 = __lsx_vldx(dst_hi, dst_stride);
+    const __m128i d6 = __lsx_vldx(dst_hi, dst_stride2);
+    const __m128i d7 = __lsx_vldx(dst_hi, dst_stride3);
+
+    __lsx_vst(__lsx_vavgr_bu(s0, d0), dst, 0);
+    __lsx_vstx(__lsx_vavgr_bu(s1, d1), dst, dst_stride);
+    __lsx_vstx(__lsx_vavgr_bu(s2, d2), dst, dst_stride2);
+    __lsx_vstx(__lsx_vavgr_bu(s3, d3), dst, dst_stride3);
+    __lsx_vst(__lsx_vavgr_bu(s4, d4), dst_hi, 0);
+    __lsx_vstx(__lsx_vavgr_bu(s5, d5), dst_hi, dst_stride);
+    __lsx_vstx(__lsx_vavgr_bu(s6, d6), dst_hi, dst_stride2);
+    __lsx_vstx(__lsx_vavgr_bu(s7, d7), dst_hi, dst_stride3);
+
+    src = src_hi + src_stride4;
+    dst = dst_hi + dst_stride4;
+  }
+}
+
+/*
+ * Rounded average of a 32-pixel-wide block of src into dst, eight rows per
+ * iteration (height / 8 iterations; leftover rows are not processed).
+ *
+ * Each row is handled as two 16-byte vectors. Within a group of four rows the
+ * even-numbered registers (src0, src2, ...) hold the left half of a row and
+ * the odd-numbered ones (src1, src3, ...) the right half.
+ */
+static void avg_width32_lsx(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride, int32_t height) {
+  int32_t cnt = (height / 8);
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i src8, src9, src10, src11, src12, src13, src14, src15;
+  __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+  __m128i dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
+
+  const int32_t src_stride2 = src_stride << 1;
+  const int32_t src_stride3 = src_stride2 + src_stride;
+  const int32_t src_stride4 = src_stride2 << 1;
+
+  const int32_t dst_stride2 = dst_stride << 1;
+  const int32_t dst_stride3 = dst_stride2 + dst_stride;
+  const int32_t dst_stride4 = dst_stride2 << 1;
+
+  for (; cnt--;) {
+    /* Top (rows 0..3) / bottom (rows 4..7), left / right 16-byte columns.
+     * The source pointers stay const-qualified. */
+    const uint8_t *const src_tr = src + 16;
+    const uint8_t *const src_bl = src + src_stride4;
+    const uint8_t *const src_br = src_bl + 16;
+    uint8_t *const dst_tr = dst + 16;
+    uint8_t *const dst_bl = dst + dst_stride4;
+    uint8_t *const dst_br = dst_bl + 16;
+
+    /* Source rows 0..3. */
+    DUP2_ARG2(__lsx_vld, src, 0, src_tr, 0, src0, src1);
+    DUP4_ARG2(__lsx_vldx, src, src_stride, src_tr, src_stride, src,
+              src_stride2, src_tr, src_stride2, src2, src3, src4, src5);
+    DUP2_ARG2(__lsx_vldx, src, src_stride3, src_tr, src_stride3, src6, src7);
+
+    /* Destination rows 0..3. */
+    DUP2_ARG2(__lsx_vld, dst, 0, dst_tr, 0, dst0, dst1);
+    DUP4_ARG2(__lsx_vldx, dst, dst_stride, dst_tr, dst_stride, dst,
+              dst_stride2, dst_tr, dst_stride2, dst2, dst3, dst4, dst5);
+    DUP2_ARG2(__lsx_vldx, dst, dst_stride3, dst_tr, dst_stride3, dst6, dst7);
+
+    /* Source rows 4..7. */
+    DUP2_ARG2(__lsx_vld, src_bl, 0, src_br, 0, src8, src9);
+    DUP4_ARG2(__lsx_vldx, src_bl, src_stride, src_br, src_stride, src_bl,
+              src_stride2, src_br, src_stride2, src10, src11, src12, src13);
+    DUP2_ARG2(__lsx_vldx, src_bl, src_stride3, src_br, src_stride3, src14,
+              src15);
+
+    /* Destination rows 4..7. */
+    DUP2_ARG2(__lsx_vld, dst_bl, 0, dst_br, 0, dst8, dst9);
+    DUP4_ARG2(__lsx_vldx, dst_bl, dst_stride, dst_br, dst_stride, dst_bl,
+              dst_stride2, dst_br, dst_stride2, dst10, dst11, dst12, dst13);
+    DUP2_ARG2(__lsx_vldx, dst_bl, dst_stride3, dst_br, dst_stride3, dst14,
+              dst15);
+
+    DUP4_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+              dst0, dst1, dst2, dst3);
+    DUP4_ARG2(__lsx_vavgr_bu, src4, dst4, src5, dst5, src6, dst6, src7, dst7,
+              dst4, dst5, dst6, dst7);
+    DUP4_ARG2(__lsx_vavgr_bu, src8, dst8, src9, dst9, src10, dst10, src11,
+              dst11, dst8, dst9, dst10, dst11);
+    DUP4_ARG2(__lsx_vavgr_bu, src12, dst12, src13, dst13, src14, dst14, src15,
+              dst15, dst12, dst13, dst14, dst15);
+
+    /* Rows 0..3: left column, then right column. */
+    __lsx_vst(dst0, dst, 0);
+    __lsx_vstx(dst2, dst, dst_stride);
+    __lsx_vstx(dst4, dst, dst_stride2);
+    __lsx_vstx(dst6, dst, dst_stride3);
+    __lsx_vst(dst1, dst_tr, 0);
+    __lsx_vstx(dst3, dst_tr, dst_stride);
+    __lsx_vstx(dst5, dst_tr, dst_stride2);
+    __lsx_vstx(dst7, dst_tr, dst_stride3);
+
+    /* Rows 4..7: left column, then right column. */
+    __lsx_vst(dst8, dst_bl, 0);
+    __lsx_vstx(dst10, dst_bl, dst_stride);
+    __lsx_vstx(dst12, dst_bl, dst_stride2);
+    __lsx_vstx(dst14, dst_bl, dst_stride3);
+    __lsx_vst(dst9, dst_br, 0);
+    __lsx_vstx(dst11, dst_br, dst_stride);
+    __lsx_vstx(dst13, dst_br, dst_stride2);
+    __lsx_vstx(dst15, dst_br, dst_stride3);
+
+    src = src_bl + src_stride4;
+    dst = dst_bl + dst_stride4;
+  }
+}
+
+/*
+ * Rounded average of a 64-pixel-wide block of src into dst. Four rows are
+ * handled per iteration (height / 4 iterations; leftover rows are not
+ * processed), each row as four 16-byte vectors.
+ */
+static void avg_width64_lsx(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride, int32_t height) {
+  int32_t groups = height / 4;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i src8, src9, src10, src11, src12, src13, src14, src15;
+  __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+  __m128i dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
+
+  while (groups--) {
+    const uint8_t *const src_r1 = src + src_stride;
+    const uint8_t *const src_r2 = src_r1 + src_stride;
+    const uint8_t *const src_r3 = src_r2 + src_stride;
+    uint8_t *const dst_r1 = dst + dst_stride;
+    uint8_t *const dst_r2 = dst_r1 + dst_stride;
+    uint8_t *const dst_r3 = dst_r2 + dst_stride;
+
+    /* All loads for the four rows are issued before the first store. */
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
+              src3);
+    DUP4_ARG2(__lsx_vld, src_r1, 0, src_r1, 16, src_r1, 32, src_r1, 48, src4,
+              src5, src6, src7);
+    DUP4_ARG2(__lsx_vld, src_r2, 0, src_r2, 16, src_r2, 32, src_r2, 48, src8,
+              src9, src10, src11);
+    DUP4_ARG2(__lsx_vld, src_r3, 0, src_r3, 16, src_r3, 32, src_r3, 48, src12,
+              src13, src14, src15);
+
+    DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, dst0, dst1, dst2,
+              dst3);
+    DUP4_ARG2(__lsx_vld, dst_r1, 0, dst_r1, 16, dst_r1, 32, dst_r1, 48, dst4,
+              dst5, dst6, dst7);
+    DUP4_ARG2(__lsx_vld, dst_r2, 0, dst_r2, 16, dst_r2, 32, dst_r2, 48, dst8,
+              dst9, dst10, dst11);
+    DUP4_ARG2(__lsx_vld, dst_r3, 0, dst_r3, 16, dst_r3, 32, dst_r3, 48, dst12,
+              dst13, dst14, dst15);
+
+    DUP4_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+              dst0, dst1, dst2, dst3);
+    DUP4_ARG2(__lsx_vavgr_bu, src4, dst4, src5, dst5, src6, dst6, src7, dst7,
+              dst4, dst5, dst6, dst7);
+    DUP4_ARG2(__lsx_vavgr_bu, src8, dst8, src9, dst9, src10, dst10, src11,
+              dst11, dst8, dst9, dst10, dst11);
+    DUP4_ARG2(__lsx_vavgr_bu, src12, dst12, src13, dst13, src14, dst14, src15,
+              dst15, dst12, dst13, dst14, dst15);
+
+    __lsx_vst(dst0, dst, 0);
+    __lsx_vst(dst1, dst, 16);
+    __lsx_vst(dst2, dst, 32);
+    __lsx_vst(dst3, dst, 48);
+
+    __lsx_vst(dst4, dst_r1, 0);
+    __lsx_vst(dst5, dst_r1, 16);
+    __lsx_vst(dst6, dst_r1, 32);
+    __lsx_vst(dst7, dst_r1, 48);
+
+    __lsx_vst(dst8, dst_r2, 0);
+    __lsx_vst(dst9, dst_r2, 16);
+    __lsx_vst(dst10, dst_r2, 32);
+    __lsx_vst(dst11, dst_r2, 48);
+
+    __lsx_vst(dst12, dst_r3, 0);
+    __lsx_vst(dst13, dst_r3, 16);
+    __lsx_vst(dst14, dst_r3, 32);
+    __lsx_vst(dst15, dst_r3, 48);
+
+    src = src_r3 + src_stride;
+    dst = dst_r3 + dst_stride;
+  }
+}
+
+/*
+ * Averages (with rounding) the w x h block at src into dst. Widths 4, 8, 16,
+ * 32 and 64 go to the LSX kernels; any other width uses a scalar loop. The
+ * filter and sub-pixel position/step arguments are accepted but not used.
+ */
+void vpx_convolve_avg_lsx(const uint8_t *src, ptrdiff_t src_stride,
+                          uint8_t *dst, ptrdiff_t dst_stride,
+                          const InterpKernel *filter, int x0_q4,
+                          int32_t x_step_q4, int y0_q4, int32_t y_step_q4,
+                          int32_t w, int32_t h) {
+  int32_t rows_left, col;
+
+  (void)filter;
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
+
+  if (w == 4) {
+    avg_width4_lsx(src, src_stride, dst, dst_stride, h);
+    return;
+  }
+  if (w == 8) {
+    avg_width8_lsx(src, src_stride, dst, dst_stride, h);
+    return;
+  }
+  if (w == 16) {
+    avg_width16_lsx(src, src_stride, dst, dst_stride, h);
+    return;
+  }
+  if (w == 32) {
+    avg_width32_lsx(src, src_stride, dst, dst_stride, h);
+    return;
+  }
+  if (w == 64) {
+    avg_width64_lsx(src, src_stride, dst, dst_stride, h);
+    return;
+  }
+
+  /* Scalar fallback: (a + b + 1) >> 1 per pixel, one row at a time. */
+  rows_left = h;
+  while (rows_left--) {
+    for (col = 0; col < w; ++col) {
+      dst[col] = (((dst[col] + src[col]) + 1) >> 1);
+    }
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_copy_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_copy_lsx.c
new file mode 100644
index 0000000000..53dc7097ed
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_copy_lsx.c
@@ -0,0 +1,437 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_util/loongson_intrinsics.h"
+
+/* Copies two 8-byte rows: 16-byte loads, low 64 bits of each stored. */
+static void copy_w8_rows2_lsx(const uint8_t *src, int32_t src_stride,
+                              uint8_t *dst, int32_t dst_stride) {
+  const __m128i r0 = __lsx_vld(src, 0);
+  const __m128i r1 = __lsx_vldx(src, src_stride);
+
+  __lsx_vstelm_d(r0, dst, 0, 0);
+  __lsx_vstelm_d(r1, dst + dst_stride, 0, 0);
+}
+
+/* Copies four 8-byte rows; all four loads precede the stores. */
+static void copy_w8_rows4_lsx(const uint8_t *src, int32_t src_stride,
+                              uint8_t *dst, int32_t dst_stride) {
+  const __m128i r0 = __lsx_vld(src, 0);
+  const __m128i r1 = __lsx_vldx(src, src_stride);
+  const __m128i r2 = __lsx_vldx(src, 2 * src_stride);
+  const __m128i r3 = __lsx_vldx(src, 3 * src_stride);
+
+  __lsx_vstelm_d(r0, dst, 0, 0);
+  __lsx_vstelm_d(r1, dst + dst_stride, 0, 0);
+  __lsx_vstelm_d(r2, dst + 2 * dst_stride, 0, 0);
+  __lsx_vstelm_d(r3, dst + 3 * dst_stride, 0, 0);
+}
+
+/* Copies eight 8-byte rows; all eight loads precede the stores. */
+static void copy_w8_rows8_lsx(const uint8_t *src, int32_t src_stride,
+                              uint8_t *dst, int32_t dst_stride) {
+  const uint8_t *const src_mid = src + 4 * src_stride;
+  uint8_t *const dst_mid = dst + 4 * dst_stride;
+  const __m128i r0 = __lsx_vld(src, 0);
+  const __m128i r1 = __lsx_vldx(src, src_stride);
+  const __m128i r2 = __lsx_vldx(src, 2 * src_stride);
+  const __m128i r3 = __lsx_vldx(src, 3 * src_stride);
+  const __m128i r4 = __lsx_vld(src_mid, 0);
+  const __m128i r5 = __lsx_vldx(src_mid, src_stride);
+  const __m128i r6 = __lsx_vldx(src_mid, 2 * src_stride);
+  const __m128i r7 = __lsx_vldx(src_mid, 3 * src_stride);
+
+  __lsx_vstelm_d(r0, dst, 0, 0);
+  __lsx_vstelm_d(r1, dst + dst_stride, 0, 0);
+  __lsx_vstelm_d(r2, dst + 2 * dst_stride, 0, 0);
+  __lsx_vstelm_d(r3, dst + 3 * dst_stride, 0, 0);
+  __lsx_vstelm_d(r4, dst_mid, 0, 0);
+  __lsx_vstelm_d(r5, dst_mid + dst_stride, 0, 0);
+  __lsx_vstelm_d(r6, dst_mid + 2 * dst_stride, 0, 0);
+  __lsx_vstelm_d(r7, dst_mid + 3 * dst_stride, 0, 0);
+}
+
+/*
+ * Copies an 8-pixel-wide block. The unroll factor is the first of 12, 8, 4, 2
+ * that divides height; an odd height copies nothing.
+ */
+static void copy_width8_lsx(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride, int32_t height) {
+  int32_t groups;
+
+  if ((height % 12) == 0) {
+    /* Twelve rows per pass: a group of eight followed by a group of four. */
+    groups = height / 12;
+    while (groups--) {
+      copy_w8_rows8_lsx(src, src_stride, dst, dst_stride);
+      src += 8 * src_stride;
+      dst += 8 * dst_stride;
+      copy_w8_rows4_lsx(src, src_stride, dst, dst_stride);
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+    }
+  } else if ((height % 8) == 0) {
+    groups = height >> 3;
+    while (groups--) {
+      copy_w8_rows8_lsx(src, src_stride, dst, dst_stride);
+      src += 8 * src_stride;
+      dst += 8 * dst_stride;
+    }
+  } else if ((height % 4) == 0) {
+    groups = height / 4;
+    while (groups--) {
+      copy_w8_rows4_lsx(src, src_stride, dst, dst_stride);
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+    }
+  } else if ((height % 2) == 0) {
+    groups = height / 2;
+    while (groups--) {
+      copy_w8_rows2_lsx(src, src_stride, dst, dst_stride);
+      src += 2 * src_stride;
+      dst += 2 * dst_stride;
+    }
+  }
+}
+
+/*
+ * Copies a block column by column: width >> 4 columns of 16 bytes, each
+ * walked top to bottom in height >> 3 groups of eight rows. Rows or columns
+ * beyond those multiples are not copied.
+ */
+static void copy_16multx8mult_lsx(const uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  int32_t height, int32_t width) {
+  const int32_t src_stride2 = src_stride << 1;
+  const int32_t src_stride3 = src_stride2 + src_stride;
+  const int32_t src_stride4 = src_stride2 << 1;
+  const int32_t dst_stride2 = dst_stride << 1;
+  const int32_t dst_stride3 = dst_stride2 + dst_stride;
+  const int32_t dst_stride4 = dst_stride2 << 1;
+  int32_t cols_left;
+
+  for (cols_left = (width >> 4); cols_left--; src += 16, dst += 16) {
+    const uint8_t *in = src;
+    uint8_t *out = dst;
+    int32_t groups = (height >> 3);
+
+    while (groups--) {
+      const uint8_t *const in_hi = in + src_stride4;
+      uint8_t *const out_hi = out + dst_stride4;
+
+      /* Eight loads first, then eight stores. */
+      const __m128i r0 = __lsx_vld(in, 0);
+      const __m128i r1 = __lsx_vldx(in, src_stride);
+      const __m128i r2 = __lsx_vldx(in, src_stride2);
+      const __m128i r3 = __lsx_vldx(in, src_stride3);
+      const __m128i r4 = __lsx_vld(in_hi, 0);
+      const __m128i r5 = __lsx_vldx(in_hi, src_stride);
+      const __m128i r6 = __lsx_vldx(in_hi, src_stride2);
+      const __m128i r7 = __lsx_vldx(in_hi, src_stride3);
+
+      __lsx_vst(r0, out, 0);
+      __lsx_vstx(r1, out, dst_stride);
+      __lsx_vstx(r2, out, dst_stride2);
+      __lsx_vstx(r3, out, dst_stride3);
+      __lsx_vst(r4, out_hi, 0);
+      __lsx_vstx(r5, out_hi, dst_stride);
+      __lsx_vstx(r6, out_hi, dst_stride2);
+      __lsx_vstx(r7, out_hi, dst_stride3);
+
+      in = in_hi + src_stride4;
+      out = out_hi + dst_stride4;
+    }
+  }
+}
+
+/* Copies four 16-byte rows; all four loads precede the stores. */
+static void copy_w16_rows4_lsx(const uint8_t *src, int32_t src_stride,
+                               uint8_t *dst, int32_t dst_stride) {
+  const __m128i r0 = __lsx_vld(src, 0);
+  const __m128i r1 = __lsx_vldx(src, src_stride);
+  const __m128i r2 = __lsx_vldx(src, 2 * src_stride);
+  const __m128i r3 = __lsx_vldx(src, 3 * src_stride);
+
+  __lsx_vst(r0, dst, 0);
+  __lsx_vst(r1, dst + dst_stride, 0);
+  __lsx_vst(r2, dst + 2 * dst_stride, 0);
+  __lsx_vst(r3, dst + 3 * dst_stride, 0);
+}
+
+/* Copies eight 16-byte rows; all eight loads precede the stores. */
+static void copy_w16_rows8_lsx(const uint8_t *src, int32_t src_stride,
+                               uint8_t *dst, int32_t dst_stride) {
+  const uint8_t *const src_mid = src + 4 * src_stride;
+  uint8_t *const dst_mid = dst + 4 * dst_stride;
+  const __m128i r0 = __lsx_vld(src, 0);
+  const __m128i r1 = __lsx_vldx(src, src_stride);
+  const __m128i r2 = __lsx_vldx(src, 2 * src_stride);
+  const __m128i r3 = __lsx_vldx(src, 3 * src_stride);
+  const __m128i r4 = __lsx_vld(src_mid, 0);
+  const __m128i r5 = __lsx_vldx(src_mid, src_stride);
+  const __m128i r6 = __lsx_vldx(src_mid, 2 * src_stride);
+  const __m128i r7 = __lsx_vldx(src_mid, 3 * src_stride);
+
+  __lsx_vst(r0, dst, 0);
+  __lsx_vst(r1, dst + dst_stride, 0);
+  __lsx_vst(r2, dst + 2 * dst_stride, 0);
+  __lsx_vst(r3, dst + 3 * dst_stride, 0);
+  __lsx_vst(r4, dst_mid, 0);
+  __lsx_vst(r5, dst_mid + dst_stride, 0);
+  __lsx_vst(r6, dst_mid + 2 * dst_stride, 0);
+  __lsx_vst(r7, dst_mid + 3 * dst_stride, 0);
+}
+
+/*
+ * Copies a 16-pixel-wide block. Heights divisible by 12 are unrolled by 12,
+ * heights divisible by 8 are delegated to copy_16multx8mult_lsx(), heights
+ * divisible by 4 are unrolled by 4; any other height copies nothing.
+ */
+static void copy_width16_lsx(const uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride, int32_t height) {
+  int32_t groups;
+
+  if ((height % 12) == 0) {
+    /* Twelve rows per pass: a group of eight followed by a group of four. */
+    groups = height / 12;
+    while (groups--) {
+      copy_w16_rows8_lsx(src, src_stride, dst, dst_stride);
+      src += 8 * src_stride;
+      dst += 8 * dst_stride;
+      copy_w16_rows4_lsx(src, src_stride, dst, dst_stride);
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+    }
+  } else if ((height % 8) == 0) {
+    copy_16multx8mult_lsx(src, src_stride, dst, dst_stride, height, 16);
+  } else if ((height % 4) == 0) {
+    groups = height >> 2;
+    while (groups--) {
+      copy_w16_rows4_lsx(src, src_stride, dst, dst_stride);
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+    }
+  }
+}
+
+/*
+ * Copies a 32-pixel-wide block.
+ *
+ * Heights divisible by 8 but not by 12 are delegated to
+ * copy_16multx8mult_lsx(). Every other height divisible by 4 (which includes
+ * all multiples of 12) is copied four rows at a time, each row as a left and
+ * a right 16-byte vector. Any other height copies nothing.
+ *
+ * The right-half destination pointer is taken from the row pointer BEFORE it
+ * is advanced. Previously it was computed after the four left-half stores had
+ * already moved dst down four rows, so the right half was written four rows
+ * too low and overran the block on the last group.
+ */
+static void copy_width32_lsx(const uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride, int32_t height) {
+  int32_t cnt;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  const int32_t src_stride2 = src_stride << 1;
+  const int32_t src_stride3 = src_stride2 + src_stride;
+  const int32_t src_stride4 = src_stride2 << 1;
+  const int32_t dst_stride2 = dst_stride << 1;
+  const int32_t dst_stride3 = dst_stride2 + dst_stride;
+  const int32_t dst_stride4 = dst_stride2 << 1;
+
+  if ((height % 12) != 0 && (height % 8) == 0) {
+    copy_16multx8mult_lsx(src, src_stride, dst, dst_stride, height, 32);
+    return;
+  }
+
+  if ((height % 4) != 0) {
+    return;
+  }
+
+  for (cnt = (height / 4); cnt--;) {
+    /* Right 16-byte column of the same four rows. */
+    const uint8_t *const src_tmp = src + 16;
+    uint8_t *const dst_tmp = dst + 16;
+
+    src0 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+    src3 = __lsx_vldx(src, src_stride3);
+
+    src4 = __lsx_vld(src_tmp, 0);
+    DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5,
+              src6);
+    src7 = __lsx_vldx(src_tmp, src_stride3);
+
+    __lsx_vst(src0, dst, 0);
+    __lsx_vstx(src1, dst, dst_stride);
+    __lsx_vstx(src2, dst, dst_stride2);
+    __lsx_vstx(src3, dst, dst_stride3);
+
+    __lsx_vst(src4, dst_tmp, 0);
+    __lsx_vstx(src5, dst_tmp, dst_stride);
+    __lsx_vstx(src6, dst_tmp, dst_stride2);
+    __lsx_vstx(src7, dst_tmp, dst_stride3);
+
+    src += src_stride4;
+    dst += dst_stride4;
+  }
+}
+
+/*
+ * Copies a 64-pixel-wide block by handing it to copy_16multx8mult_lsx() with
+ * a width of 64.
+ */
+static void copy_width64_lsx(const uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride, int32_t height) {
+  const int32_t block_width = 64;
+
+  copy_16multx8mult_lsx(src, src_stride, dst, dst_stride, height, block_width);
+}
+
+/*
+ * Copies the w x h block at src to dst. Widths 8, 16, 32 and 64 go to the
+ * LSX kernels, width 4 is copied one 32-bit word per row, and any other
+ * width falls back to memcpy() per row. The filter and sub-pixel
+ * position/step arguments are accepted but not used.
+ */
+void vpx_convolve_copy_lsx(const uint8_t *src, ptrdiff_t src_stride,
+                           uint8_t *dst, ptrdiff_t dst_stride,
+                           const InterpKernel *filter, int x0_q4,
+                           int32_t x_step_q4, int y0_q4, int32_t y_step_q4,
+                           int32_t w, int32_t h) {
+  uint32_t rows_left;
+
+  (void)filter;
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
+
+  if (w == 8) {
+    copy_width8_lsx(src, src_stride, dst, dst_stride, h);
+  } else if (w == 16) {
+    copy_width16_lsx(src, src_stride, dst, dst_stride, h);
+  } else if (w == 32) {
+    copy_width32_lsx(src, src_stride, dst, dst_stride, h);
+  } else if (w == 64) {
+    copy_width64_lsx(src, src_stride, dst, dst_stride, h);
+  } else if (w == 4) {
+    rows_left = h;
+    while (rows_left--) {
+      /* Load one 32-bit word and store lane 0 of the resulting vector. */
+      const __m128i word = __lsx_vldrepl_w(src, 0);
+
+      __lsx_vstelm_w(word, dst, 0, 0);
+      src += src_stride;
+      dst += dst_stride;
+    }
+  } else {
+    rows_left = h;
+    while (rows_left--) {
+      memcpy(dst, src, w);
+      src += src_stride;
+      dst += dst_stride;
+    }
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_lsx.h b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_lsx.h
new file mode 100644
index 0000000000..d886b00198
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_lsx.h
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_LOONGARCH_VPX_CONVOLVE_LSX_H_
+#define VPX_VPX_DSP_LOONGARCH_VPX_CONVOLVE_LSX_H_
+
+#include "./vpx_config.h"
+#include "vpx_dsp/vpx_filter.h"
+#include "vpx_util/loongson_intrinsics.h"
+
+/*
+ * Applies four filter vectors to four input vectors. _reg0/_reg1 are
+ * accumulated with _filter0/_filter1, _reg2/_reg3 with _filter2/_filter3, and
+ * the two partial sums are then combined with __lsx_vsadd_h.
+ */
+static INLINE __m128i filt_8tap_dpadd_s_h(__m128i _reg0, __m128i _reg1,
+                                          __m128i _reg2, __m128i _reg3,
+                                          __m128i _filter0, __m128i _filter1,
+                                          __m128i _filter2, __m128i _filter3) {
+  const __m128i _sum01 =
+      __lsx_vdp2add_h_b(__lsx_vdp2_h_b(_reg0, _filter0), _reg1, _filter1);
+  const __m128i _sum23 =
+      __lsx_vdp2add_h_b(__lsx_vdp2_h_b(_reg2, _filter2), _reg3, _filter3);
+
+  return __lsx_vsadd_h(_sum01, _sum23);
+}
+
+/*
+ * Horizontal 8-tap filter step. Bytes are picked from the (_src1, _src0) pair
+ * with each of the four masks, fed through filt_8tap_dpadd_s_h(), rounded and
+ * shifted right by FILTER_BITS, and finally passed through __lsx_vsat_h(, 7).
+ */
+static INLINE __m128i horiz_8tap_filt(__m128i _src0, __m128i _src1,
+                                      __m128i _mask0, __m128i _mask1,
+                                      __m128i _mask2, __m128i _mask3,
+                                      __m128i _filt_h0, __m128i _filt_h1,
+                                      __m128i _filt_h2, __m128i _filt_h3) {
+  const __m128i _sel0 = __lsx_vshuf_b(_src1, _src0, _mask0);
+  const __m128i _sel1 = __lsx_vshuf_b(_src1, _src0, _mask1);
+  const __m128i _sel2 = __lsx_vshuf_b(_src1, _src0, _mask2);
+  const __m128i _sel3 = __lsx_vshuf_b(_src1, _src0, _mask3);
+  __m128i _sum = filt_8tap_dpadd_s_h(_sel0, _sel1, _sel2, _sel3, _filt_h0,
+                                     _filt_h1, _filt_h2, _filt_h3);
+
+  _sum = __lsx_vsrari_h(_sum, FILTER_BITS);
+  return __lsx_vsat_h(_sum, 7);
+}
+
+/*
+ * Horizontal 2-tap filter step: shuffles bytes from the (in1, in0) pair with
+ * mask, takes the unsigned byte dot product with coeff, then rounds and
+ * shifts right by FILTER_BITS.
+ */
+static INLINE __m128i horiz_2tap_filt_uh(__m128i in0, __m128i in1, __m128i mask,
+                                         __m128i coeff) {
+  const __m128i picked = __lsx_vshuf_b(in1, in0, mask);
+  const __m128i products = __lsx_vdp2_h_bu(picked, coeff);
+
+  return __lsx_vsrari_h(products, FILTER_BITS);
+}
+
+#define LSX_LD_4(_src, _stride, _src0, _src1, _src2, _src3) \
+ do { /* load four vectors into _src0.._src3, one every _stride bytes */ \
+ _src0 = __lsx_vld(_src, 0); \
+ _src += _stride; /* NOTE: this modifies the caller's _src */ \
+ _src1 = __lsx_vld(_src, 0); \
+ _src += _stride; \
+ _src2 = __lsx_vld(_src, 0); \
+ _src += _stride; \
+ _src3 = __lsx_vld(_src, 0); /* _src is left at the fourth load address */ \
+ } while (0)
+
+#define HORIZ_8TAP_4WID_4VECS_FILT(_src0, _src1, _src2, _src3, _mask0, _mask1, \
+ _mask2, _mask3, _filter0, _filter1, \
+ _filter2, _filter3, _out0, _out1) \
+ do { /* 8-tap sums for pairs (_src1,_src0)->_out0, (_src3,_src2)->_out1 */ \
+ __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7; \
+ __m128i _reg0, _reg1, _reg2, _reg3; \
+ \
+ /* _mask0 bytes x _filter0 start the first partial sums (_reg0/_reg1) */ \
+ DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask0, _src3, _src2, _mask0, \
+ _tmp0, _tmp1); \
+ DUP2_ARG2(__lsx_vdp2_h_b, _tmp0, _filter0, _tmp1, _filter0, _reg0, _reg1); \
+ /* _mask1 bytes x _filter1 are accumulated onto _reg0/_reg1 */ \
+ DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask1, _src3, _src2, _mask1, \
+ _tmp2, _tmp3); \
+ DUP2_ARG3(__lsx_vdp2add_h_b, _reg0, _tmp2, _filter1, _reg1, _tmp3, \
+ _filter1, _reg0, _reg1); \
+ /* _mask2 bytes x _filter2 start the second partial sums (_reg2/_reg3) */ \
+ DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask2, _src3, _src2, _mask2, \
+ _tmp4, _tmp5); \
+ DUP2_ARG2(__lsx_vdp2_h_b, _tmp4, _filter2, _tmp5, _filter2, _reg2, _reg3); \
+ /* _mask3 bytes x _filter3 are accumulated onto _reg2/_reg3 */ \
+ DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask3, _src3, _src2, _mask3, \
+ _tmp6, _tmp7); \
+ DUP2_ARG3(__lsx_vdp2add_h_b, _reg2, _tmp6, _filter3, _reg3, _tmp7, \
+ _filter3, _reg2, _reg3); \
+ /* combine both partial sums; no rounding shift is applied in this macro */ \
+ DUP2_ARG2(__lsx_vsadd_h, _reg0, _reg2, _reg1, _reg3, _out0, _out1); \
+ } while (0)
+
+#define HORIZ_8TAP_8WID_4VECS_FILT( \
+ _src0, _src1, _src2, _src3, _mask0, _mask1, _mask2, _mask3, _filter0, \
+ _filter1, _filter2, _filter3, _out0, _out1, _out2, _out3) \
+ do { /* 8-tap sums, one output per source: _srcN -> _outN */ \
+ __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7; \
+ __m128i _reg0, _reg1, _reg2, _reg3, _reg4, _reg5, _reg6, _reg7; \
+ \
+ /* each source is shuffled with itself; _mask0 x _filter0 -> _reg0.._reg3 */ \
+ DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask0, _src1, _src1, _mask0, \
+ _src2, _src2, _mask0, _src3, _src3, _mask0, _tmp0, _tmp1, _tmp2, \
+ _tmp3); \
+ DUP4_ARG2(__lsx_vdp2_h_b, _tmp0, _filter0, _tmp1, _filter0, _tmp2, \
+ _filter0, _tmp3, _filter0, _reg0, _reg1, _reg2, _reg3); \
+ /* _mask2 x _filter2 -> _reg4.._reg7 (second partial sums) */ \
+ DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask2, _src1, _src1, _mask2, \
+ _src2, _src2, _mask2, _src3, _src3, _mask2, _tmp0, _tmp1, _tmp2, \
+ _tmp3); \
+ DUP4_ARG2(__lsx_vdp2_h_b, _tmp0, _filter2, _tmp1, _filter2, _tmp2, \
+ _filter2, _tmp3, _filter2, _reg4, _reg5, _reg6, _reg7); \
+ /* _mask1 x _filter1 accumulated onto _reg0.._reg3 */ \
+ DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask1, _src1, _src1, _mask1, \
+ _src2, _src2, _mask1, _src3, _src3, _mask1, _tmp4, _tmp5, _tmp6, \
+ _tmp7); \
+ DUP4_ARG3(__lsx_vdp2add_h_b, _reg0, _tmp4, _filter1, _reg1, _tmp5, \
+ _filter1, _reg2, _tmp6, _filter1, _reg3, _tmp7, _filter1, _reg0, \
+ _reg1, _reg2, _reg3); \
+ /* _mask3 x _filter3 accumulated onto _reg4.._reg7 */ \
+ DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask3, _src1, _src1, _mask3, \
+ _src2, _src2, _mask3, _src3, _src3, _mask3, _tmp4, _tmp5, _tmp6, \
+ _tmp7); \
+ DUP4_ARG3(__lsx_vdp2add_h_b, _reg4, _tmp4, _filter3, _reg5, _tmp5, \
+ _filter3, _reg6, _tmp6, _filter3, _reg7, _tmp7, _filter3, _reg4, \
+ _reg5, _reg6, _reg7); \
+ /* combine both partial sums; no rounding shift is applied in this macro */ \
+ DUP4_ARG2(__lsx_vsadd_h, _reg0, _reg4, _reg1, _reg5, _reg2, _reg6, _reg3, \
+ _reg7, _out0, _out1, _out2, _out3); \
+ } while (0)
+
+#define AVG_ST4_D(in0, in1, dst0, dst1, pdst, stride) \
+ do { /* average in0/dst0 and in1/dst1, store as four 64-bit rows at pdst */ \
+ __m128i tmp0_m, tmp1_m; \
+ \
+ DUP2_ARG2(__lsx_vavgr_bu, in0, dst0, in1, dst1, tmp0_m, tmp1_m); \
+ __lsx_vstelm_d(tmp0_m, pdst, 0, 0); /* element 0 of first average */ \
+ pdst += stride; /* NOTE: this modifies the caller's pdst */ \
+ __lsx_vstelm_d(tmp0_m, pdst, 0, 1); /* element 1 of first average */ \
+ pdst += stride; \
+ __lsx_vstelm_d(tmp1_m, pdst, 0, 0); /* element 0 of second average */ \
+ pdst += stride; \
+ __lsx_vstelm_d(tmp1_m, pdst, 0, 1); /* pdst is left at this fourth row */ \
+ } while (0)
+
+#endif // VPX_VPX_DSP_LOONGARCH_VPX_CONVOLVE_LSX_H_