diff options
Diffstat (limited to 'media/libvpx/libvpx/vp8/encoder/loongarch/dct_lsx.c')
-rw-r--r-- | media/libvpx/libvpx/vp8/encoder/loongarch/dct_lsx.c | 161 |
1 files changed, 161 insertions, 0 deletions
diff --git a/media/libvpx/libvpx/vp8/encoder/loongarch/dct_lsx.c b/media/libvpx/libvpx/vp8/encoder/loongarch/dct_lsx.c new file mode 100644 index 0000000000..a08d4d3f63 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/loongarch/dct_lsx.c @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <stdint.h> +#include "./vp8_rtcd.h" +#include "vpx_util/loongson_intrinsics.h" + +#define LSX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ + { \ + __m128i _s0, _s1, _s2, _s3, _t0, _t1, _t2, _t3; \ + \ + DUP2_ARG2(__lsx_vilvl_h, _in2, _in0, _in3, _in1, _s0, _s1); \ + DUP2_ARG2(__lsx_vilvh_h, _in2, _in0, _in3, _in1, _s2, _s3); \ + _t0 = __lsx_vilvl_h(_s1, _s0); \ + _t1 = __lsx_vilvh_h(_s1, _s0); \ + _t2 = __lsx_vilvl_h(_s3, _s2); \ + _t3 = __lsx_vilvh_h(_s3, _s2); \ + DUP2_ARG2(__lsx_vpickev_d, _t2, _t0, _t3, _t1, _out0, _out2); \ + DUP2_ARG2(__lsx_vpickod_d, _t2, _t0, _t3, _t1, _out1, _out3); \ + } + +#define SET_DOTP_VALUES(coeff, val0, val1, val2, const1, const2) \ + { \ + __m128i tmp0_m, tmp1_m, tmp2_m; \ + \ + tmp0_m = __lsx_vreplvei_h(coeff, val0); \ + DUP2_ARG2(__lsx_vreplvei_h, coeff, val1, coeff, val2, tmp1_m, tmp2_m); \ + DUP2_ARG2(__lsx_vpackev_h, tmp1_m, tmp0_m, tmp0_m, tmp2_m, const1, \ + const2); \ + } + +#define RET_1_IF_NZERO_H(_in) \ + ({ \ + __m128i tmp_m; \ + __m128i one_m = __lsx_vldi(0x401); \ + __m128i max_m = __lsx_vldi(0xFF); \ + \ + tmp_m = __lsx_vseqi_h(_in, 0); \ + tmp_m = __lsx_vxor_v(tmp_m, max_m); \ + tmp_m = __lsx_vand_v(tmp_m, one_m); \ + \ + tmp_m; \ + }) + +void vp8_short_fdct4x4_lsx(int16_t *input, int16_t *output, int32_t pitch) { + __m128i in0, in1, in2, in3; + __m128i tmp0, tmp1, tmp2, tmp3, const0, const1; + __m128i coeff = { 0x38a4eb1814e808a9, 0x659061a82ee01d4c }; + __m128i out0, out1, out2, out3; + __m128i zero = __lsx_vldi(0); + int32_t pitch2 = pitch << 1; + int32_t pitch3 = pitch2 + pitch; + + in0 = __lsx_vld(input, 0); + DUP2_ARG2(__lsx_vldx, input, pitch, input, pitch2, in1, in2); + in3 = __lsx_vldx(input, pitch3); + + LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3); + LSX_BUTTERFLY_4_H(in0, in1, in2, in3, tmp0, tmp1, in1, in3); + DUP4_ARG2(__lsx_vslli_h, tmp0, 3, tmp1, 3, in1, 3, in3, 3, tmp0, tmp1, in1, + in3); + in0 = __lsx_vadd_h(tmp0, tmp1); + in2 = __lsx_vsub_h(tmp0, tmp1); + SET_DOTP_VALUES(coeff, 0, 1, 2, const0, const1); + tmp0 = __lsx_vilvl_h(in3, in1); + in1 = __lsx_vreplvei_h(coeff, 3); + out0 = __lsx_vpackev_h(zero, in1); + coeff = __lsx_vilvl_h(zero, coeff); + out1 = __lsx_vreplvei_w(coeff, 0); + DUP2_ARG3(__lsx_vdp2add_w_h, out0, tmp0, const0, out1, tmp0, const1, out0, + out1); + DUP2_ARG3(__lsx_vsrani_h_w, out0, out0, 12, out1, out1, 12, in1, in3); + LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3); + LSX_BUTTERFLY_4_H(in0, in1, in2, in3, tmp0, tmp1, in1, in3); + tmp2 = __lsx_vadd_h(tmp0, tmp1); + tmp3 = __lsx_vsub_h(tmp0, tmp1); + DUP2_ARG2(__lsx_vaddi_hu, tmp2, 7, tmp3, 7, in0, in2); + DUP2_ARG2(__lsx_vsrai_h, in0, 4, in2, 4, in0, in2); + DUP2_ARG2(__lsx_vilvl_h, zero, in0, zero, in2, out0, out2); + tmp1 = RET_1_IF_NZERO_H(in3); + DUP2_ARG2(__lsx_vilvl_h, zero, tmp1, in3, in1, tmp1, tmp0); + DUP2_ARG2(__lsx_vreplvei_w, coeff, 2, coeff, 3, out3, out1); + out3 = __lsx_vadd_w(out3, out1); + out1 = __lsx_vreplvei_w(coeff, 1); + DUP2_ARG3(__lsx_vdp2add_w_h, out1, tmp0, const0, out3, tmp0, const1, out1, + out3); + DUP2_ARG2(__lsx_vsrai_w, out1, 16, out3, 16, out1, out3); + out1 = __lsx_vadd_w(out1, tmp1); + DUP2_ARG2(__lsx_vpickev_h, out1, out0, out3, out2, in0, in2); + __lsx_vst(in0, output, 0); + __lsx_vst(in2, output, 16); +} + +void vp8_short_fdct8x4_lsx(int16_t *input, int16_t *output, int32_t pitch) { + __m128i in0, in1, in2, in3, temp0, temp1, tmp0, tmp1; + __m128i const0, const1, const2, vec0_w, vec1_w, vec2_w, vec3_w; + __m128i coeff = { 0x38a4eb1814e808a9, 0x659061a82ee01d4c }; + __m128i zero = __lsx_vldi(0); + int32_t pitch2 = pitch << 1; + int32_t pitch3 = pitch2 + pitch; + + in0 = __lsx_vld(input, 0); + DUP2_ARG2(__lsx_vldx, input, pitch, input, pitch2, in1, in2); + in3 = __lsx_vldx(input, pitch3); + LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3); + + LSX_BUTTERFLY_4_H(in0, in1, in2, in3, temp0, temp1, in1, in3); + DUP4_ARG2(__lsx_vslli_h, temp0, 3, temp1, 3, in1, 3, in3, 3, temp0, temp1, + in1, in3); + in0 = __lsx_vadd_h(temp0, temp1); + in2 = __lsx_vsub_h(temp0, temp1); + SET_DOTP_VALUES(coeff, 0, 1, 2, const1, const2); + temp0 = __lsx_vreplvei_h(coeff, 3); + vec1_w = __lsx_vpackev_h(zero, temp0); + coeff = __lsx_vilvh_h(zero, coeff); + vec3_w = __lsx_vreplvei_w(coeff, 0); + tmp1 = __lsx_vilvl_h(in3, in1); + tmp0 = __lsx_vilvh_h(in3, in1); + vec0_w = vec1_w; + vec2_w = vec3_w; + DUP4_ARG3(__lsx_vdp2add_w_h, vec0_w, tmp1, const1, vec1_w, tmp0, const1, + vec2_w, tmp1, const2, vec3_w, tmp0, const2, vec0_w, vec1_w, vec2_w, + vec3_w); + DUP2_ARG3(__lsx_vsrani_h_w, vec1_w, vec0_w, 12, vec3_w, vec2_w, 12, in1, in3); + LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3); + + LSX_BUTTERFLY_4_H(in0, in1, in2, in3, temp0, temp1, in1, in3); + in0 = __lsx_vadd_h(temp0, temp1); + in0 = __lsx_vaddi_hu(in0, 7); + in2 = __lsx_vsub_h(temp0, temp1); + in2 = __lsx_vaddi_hu(in2, 7); + in0 = __lsx_vsrai_h(in0, 4); + in2 = __lsx_vsrai_h(in2, 4); + DUP2_ARG2(__lsx_vreplvei_w, coeff, 2, coeff, 3, vec3_w, vec1_w); + vec3_w = __lsx_vadd_w(vec3_w, vec1_w); + vec1_w = __lsx_vreplvei_w(coeff, 1); + const0 = RET_1_IF_NZERO_H(in3); + tmp1 = __lsx_vilvl_h(in3, in1); + tmp0 = __lsx_vilvh_h(in3, in1); + vec0_w = vec1_w; + vec2_w = vec3_w; + DUP4_ARG3(__lsx_vdp2add_w_h, vec0_w, tmp1, const1, vec1_w, tmp0, const1, + vec2_w, tmp1, const2, vec3_w, tmp0, const2, vec0_w, vec1_w, vec2_w, + vec3_w); + DUP2_ARG3(__lsx_vsrani_h_w, vec1_w, vec0_w, 16, vec3_w, vec2_w, 16, in1, in3); + in1 = __lsx_vadd_h(in1, const0); + DUP2_ARG2(__lsx_vpickev_d, in1, in0, in3, in2, temp0, temp1); + __lsx_vst(temp0, output, 0); + __lsx_vst(temp1, output, 16); + + DUP2_ARG2(__lsx_vpickod_d, in1, in0, in3, in2, in0, in2); + __lsx_vst(in0, output, 32); + __lsx_vst(in2, output, 48); +} |