diff options
Diffstat (limited to 'media/libvpx/libvpx/vp8/common/mips/mmi')
6 files changed, 2476 insertions, 0 deletions
/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vp8_rtcd.h"
#include "vpx_ports/asmdefs_mmi.h"

/* Copy two 16-byte rows from %[src] to %[dst], advancing both pointers by
 * their strides after each row.  The low 8 bytes of a row travel through an
 * FP register (gsldlc1/gsldrc1 + gssdlc1/gssdrc1 tolerate unaligned
 * addresses); the high 8 bytes travel through a GPR via ldl/ldr + sdl/sdr. */
#define COPY_MEM_16X2 \
  "gsldlc1 %[ftmp0], 0x07(%[src]) \n\t" \
  "gsldrc1 %[ftmp0], 0x00(%[src]) \n\t" \
  "ldl %[tmp0], 0x0f(%[src]) \n\t" \
  "ldr %[tmp0], 0x08(%[src]) \n\t" \
  MMI_ADDU(%[src], %[src], %[src_stride]) \
  "gssdlc1 %[ftmp0], 0x07(%[dst]) \n\t" \
  "gssdrc1 %[ftmp0], 0x00(%[dst]) \n\t" \
  "sdl %[tmp0], 0x0f(%[dst]) \n\t" \
  "sdr %[tmp0], 0x08(%[dst]) \n\t" \
  MMI_ADDU(%[dst], %[dst], %[dst_stride]) \
  "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \
  "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \
  "ldl %[tmp1], 0x0f(%[src]) \n\t" \
  "ldr %[tmp1], 0x08(%[src]) \n\t" \
  MMI_ADDU(%[src], %[src], %[src_stride]) \
  "gssdlc1 %[ftmp1], 0x07(%[dst]) \n\t" \
  "gssdrc1 %[ftmp1], 0x00(%[dst]) \n\t" \
  "sdl %[tmp1], 0x0f(%[dst]) \n\t" \
  "sdr %[tmp1], 0x08(%[dst]) \n\t" \
  MMI_ADDU(%[dst], %[dst], %[dst_stride])

/* Copy two 8-byte rows from %[src] to %[dst], advancing both pointers by
 * their strides.  Row one goes through an FP register (unaligned-safe
 * gsldlc1/gsldrc1 pair); row two goes through a GPR via ldl/ldr.  Both
 * source loads are issued before the stores begin. */
#define COPY_MEM_8X2 \
  "gsldlc1 %[ftmp0], 0x07(%[src]) \n\t" \
  "gsldrc1 %[ftmp0], 0x00(%[src]) \n\t" \
  MMI_ADDU(%[src], %[src], %[src_stride]) \
  "ldl %[tmp0], 0x07(%[src]) \n\t" \
  "ldr %[tmp0], 0x00(%[src]) \n\t" \
  MMI_ADDU(%[src], %[src], %[src_stride]) \
  \
  "gssdlc1 %[ftmp0], 0x07(%[dst]) \n\t" \
  "gssdrc1 %[ftmp0], 0x00(%[dst]) \n\t" \
  MMI_ADDU(%[dst], %[dst], %[dst_stride]) \
  "sdl %[tmp0], 0x07(%[dst]) \n\t" \
  "sdr %[tmp0], 0x00(%[dst]) \n\t" \
  MMI_ADDU(%[dst], %[dst], %[dst_stride])

/* Copy a 16x16 pixel block: 4 loop iterations, each moving 4 rows
 * (two COPY_MEM_16X2 expansions). */
void vp8_copy_mem16x16_mmi(unsigned char *src, int src_stride,
                           unsigned char *dst, int dst_stride) {
  double ftmp[2];
  uint64_t tmp[2];
  uint8_t loop_count = 4;

  /* clang-format off */
  __asm__ volatile (
    "1: \n\t"
    COPY_MEM_16X2
    COPY_MEM_16X2
    MMI_ADDIU(%[loop_count], %[loop_count], -0x01)
    "bnez %[loop_count], 1b \n\t"
    : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
      [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
      [loop_count]"+&r"(loop_count),
      [dst]"+&r"(dst), [src]"+&r"(src)
    : [src_stride]"r"((mips_reg)src_stride),
      [dst_stride]"r"((mips_reg)dst_stride)
    : "memory"
  );
  /* clang-format on */
}

/* Copy an 8x8 pixel block: 4 loop iterations of COPY_MEM_8X2 (2 rows each). */
void vp8_copy_mem8x8_mmi(unsigned char *src, int src_stride, unsigned char *dst,
                         int dst_stride) {
  double ftmp[2];
  uint64_t tmp[1];
  uint8_t loop_count = 4;

  /* clang-format off */
  __asm__ volatile (
    "1: \n\t"
    COPY_MEM_8X2
    MMI_ADDIU(%[loop_count], %[loop_count], -0x01)
    "bnez %[loop_count], 1b \n\t"
    : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
      [tmp0]"=&r"(tmp[0]), [loop_count]"+&r"(loop_count),
      [dst]"+&r"(dst), [src]"+&r"(src)
    : [src_stride]"r"((mips_reg)src_stride),
      [dst_stride]"r"((mips_reg)dst_stride)
    : "memory"
  );
  /* clang-format on */
}

/* Copy an 8x4 pixel block: two COPY_MEM_8X2 expansions, no loop needed. */
void vp8_copy_mem8x4_mmi(unsigned char *src, int src_stride, unsigned char *dst,
                         int dst_stride) {
  double ftmp[2];
  uint64_t tmp[1];

  /* clang-format off */
  __asm__ volatile (
    COPY_MEM_8X2
    COPY_MEM_8X2
    : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
      [tmp0]"=&r"(tmp[0]),
      [dst]"+&r"(dst), [src]"+&r"(src)
    : [src_stride]"r"((mips_reg)src_stride),
      [dst_stride]"r"((mips_reg)dst_stride)
    : "memory"
  );
  /* clang-format on */
}
/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vp8_rtcd.h"
#include "vp8/common/blockd.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/asmdefs_mmi.h"

/* Dequantize one 4x4 block: d->dqcoeff[i] = d->qcoeff[i] * DQC[i] for all
 * 16 int16 coefficients, done as four 64-bit SIMD lane-wise multiplies
 * (pmullh = low 16 bits of each 16x16 product). */
void vp8_dequantize_b_mmi(BLOCKD *d, int16_t *DQC) {
  double ftmp[8];

  __asm__ volatile(
      /* load 16 quantized coefficients (4 per register) */
      "gsldlc1 %[ftmp0], 0x07(%[qcoeff]) \n\t"
      "gsldrc1 %[ftmp0], 0x00(%[qcoeff]) \n\t"
      "gsldlc1 %[ftmp1], 0x0f(%[qcoeff]) \n\t"
      "gsldrc1 %[ftmp1], 0x08(%[qcoeff]) \n\t"
      "gsldlc1 %[ftmp2], 0x17(%[qcoeff]) \n\t"
      "gsldrc1 %[ftmp2], 0x10(%[qcoeff]) \n\t"
      "gsldlc1 %[ftmp3], 0x1f(%[qcoeff]) \n\t"
      "gsldrc1 %[ftmp3], 0x18(%[qcoeff]) \n\t"

      /* load 16 dequantization factors */
      "gsldlc1 %[ftmp4], 0x07(%[DQC]) \n\t"
      "gsldrc1 %[ftmp4], 0x00(%[DQC]) \n\t"
      "gsldlc1 %[ftmp5], 0x0f(%[DQC]) \n\t"
      "gsldrc1 %[ftmp5], 0x08(%[DQC]) \n\t"
      "gsldlc1 %[ftmp6], 0x17(%[DQC]) \n\t"
      "gsldrc1 %[ftmp6], 0x10(%[DQC]) \n\t"
      "gsldlc1 %[ftmp7], 0x1f(%[DQC]) \n\t"
      "gsldrc1 %[ftmp7], 0x18(%[DQC]) \n\t"

      "pmullh %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
      "pmullh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
      "pmullh %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
      "pmullh %[ftmp3], %[ftmp3], %[ftmp7] \n\t"

      /* store the 16 dequantized coefficients */
      "gssdlc1 %[ftmp0], 0x07(%[dqcoeff]) \n\t"
      "gssdrc1 %[ftmp0], 0x00(%[dqcoeff]) \n\t"
      "gssdlc1 %[ftmp1], 0x0f(%[dqcoeff]) \n\t"
      "gssdrc1 %[ftmp1], 0x08(%[dqcoeff]) \n\t"
      "gssdlc1 %[ftmp2], 0x17(%[dqcoeff]) \n\t"
      "gssdrc1 %[ftmp2], 0x10(%[dqcoeff]) \n\t"
      "gssdlc1 %[ftmp3], 0x1f(%[dqcoeff]) \n\t"
      "gssdrc1 %[ftmp3], 0x18(%[dqcoeff]) \n\t"
      : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]),
        [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
        [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7])
      : [dqcoeff] "r"(d->dqcoeff), [qcoeff] "r"(d->qcoeff), [DQC] "r"(DQC)
      : "memory");
}

/* Dequantize `input` in place (input[i] *= dq[i]), run the 4x4 inverse DCT
 * which adds into `dest`, then clear all 16 coefficients of `input`
 * (32 bytes) so the block is ready for reuse. */
void vp8_dequant_idct_add_mmi(int16_t *input, int16_t *dq, unsigned char *dest,
                              int stride) {
  double ftmp[8];

  __asm__ volatile(
      "gsldlc1 %[ftmp0], 0x07(%[dq]) \n\t"
      "gsldrc1 %[ftmp0], 0x00(%[dq]) \n\t"
      "gsldlc1 %[ftmp1], 0x0f(%[dq]) \n\t"
      "gsldrc1 %[ftmp1], 0x08(%[dq]) \n\t"
      "gsldlc1 %[ftmp2], 0x17(%[dq]) \n\t"
      "gsldrc1 %[ftmp2], 0x10(%[dq]) \n\t"
      "gsldlc1 %[ftmp3], 0x1f(%[dq]) \n\t"
      "gsldrc1 %[ftmp3], 0x18(%[dq]) \n\t"

      "gsldlc1 %[ftmp4], 0x07(%[input]) \n\t"
      "gsldrc1 %[ftmp4], 0x00(%[input]) \n\t"
      "gsldlc1 %[ftmp5], 0x0f(%[input]) \n\t"
      "gsldrc1 %[ftmp5], 0x08(%[input]) \n\t"
      "gsldlc1 %[ftmp6], 0x17(%[input]) \n\t"
      "gsldrc1 %[ftmp6], 0x10(%[input]) \n\t"
      "gsldlc1 %[ftmp7], 0x1f(%[input]) \n\t"
      "gsldrc1 %[ftmp7], 0x18(%[input]) \n\t"

      /* input[i] = dq[i] * input[i], 4 lanes at a time */
      "pmullh %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
      "pmullh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
      "pmullh %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
      "pmullh %[ftmp3], %[ftmp3], %[ftmp7] \n\t"

      "gssdlc1 %[ftmp0], 0x07(%[input]) \n\t"
      "gssdrc1 %[ftmp0], 0x00(%[input]) \n\t"
      "gssdlc1 %[ftmp1], 0x0f(%[input]) \n\t"
      "gssdrc1 %[ftmp1], 0x08(%[input]) \n\t"
      "gssdlc1 %[ftmp2], 0x17(%[input]) \n\t"
      "gssdrc1 %[ftmp2], 0x10(%[input]) \n\t"
      "gssdlc1 %[ftmp3], 0x1f(%[input]) \n\t"
      "gssdrc1 %[ftmp3], 0x18(%[input]) \n\t"
      : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]),
        [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
        [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7])
      : [dq] "r"(dq), [input] "r"(input)
      : "memory");

  /* inverse transform the dequantized coefficients and add into dest */
  vp8_short_idct4x4llm_mmi(input, dest, stride, dest, stride);

  /* zero all 16 int16 coefficients (32 bytes): FP stores for bytes 0-7 and
   * 16-23, GPR $0 stores for bytes 8-15 and 24-31 */
  __asm__ volatile(
      "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
      "gssdlc1 %[ftmp0], 0x07(%[input]) \n\t"
      "gssdrc1 %[ftmp0], 0x00(%[input]) \n\t"
      "sdl $0, 0x0f(%[input]) \n\t"
      "sdr $0, 0x08(%[input]) \n\t"
      "gssdlc1 %[ftmp0], 0x17(%[input]) \n\t"
      "gssdrc1 %[ftmp0], 0x10(%[input]) \n\t"
      "sdl $0, 0x1f(%[input]) \n\t"
      "sdr $0, 0x18(%[input]) \n\t"
      : [ftmp0] "=&f"(ftmp[0])
      : [input] "r"(input)
      : "memory");
}

/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vp8_rtcd.h"
#include "vpx_mem/vpx_mem.h"

/* Dequantize+IDCT+add for the 16 4x4 subblocks of a 16x16 luma macroblock.
 * Per subblock: if its eob is > 1 run the full dequant+IDCT path, otherwise
 * take the DC-only shortcut (q[0]*dq[0]) and clear the first two
 * coefficients. q advances 16 coefficients per subblock; dst walks 4 pixels
 * right per subblock and drops 4 rows at the end of each subblock row. */
void vp8_dequant_idct_add_y_block_mmi(int16_t *q, int16_t *dq, uint8_t *dst,
                                      int stride, char *eobs) {
  int i, j;

  for (i = 0; i < 4; i++) {
    for (j = 0; j < 4; j++) {
      if (*eobs++ > 1) {
        vp8_dequant_idct_add_mmi(q, dq, dst, stride);
      } else {
        vp8_dc_only_idct_add_mmi(q[0] * dq[0], dst, stride, dst, stride);
        /* DC-only path leaves q untouched; clear q[0..1] by hand */
        memset(q, 0, 2 * sizeof(q[0]));
      }

      q += 16;
      dst += 4;
    }

    /* next row of subblocks: down 4 lines, back to column 0 */
    dst += 4 * stride - 16;
  }
}

/* Same per-subblock dispatch as the luma version, applied to the four 4x4
 * subblocks of the 8x8 U plane and then the four of the 8x8 V plane.
 * eobs is consumed sequentially across both planes. */
void vp8_dequant_idct_add_uv_block_mmi(int16_t *q, int16_t *dq, uint8_t *dst_u,
                                       uint8_t *dst_v, int stride, char *eobs) {
  int i, j;

  for (i = 0; i < 2; i++) {
    for (j = 0; j < 2; j++) {
      if (*eobs++ > 1) {
        vp8_dequant_idct_add_mmi(q, dq, dst_u, stride);
      } else {
        vp8_dc_only_idct_add_mmi(q[0] * dq[0], dst_u, stride, dst_u, stride);
        memset(q, 0, 2 * sizeof(q[0]));
      }

      q += 16;
      dst_u += 4;
    }

    dst_u += 4 * stride - 8;
  }

  for (i = 0; i < 2; i++) {
    for (j = 0; j < 2; j++) {
      if (*eobs++ > 1) {
        vp8_dequant_idct_add_mmi(q, dq, dst_v, stride);
      } else {
        vp8_dc_only_idct_add_mmi(q[0] * dq[0], dst_v, stride, dst_v, stride);
        memset(q, 0, 2 * sizeof(q[0]));
      }

      q += 16;
      dst_v += 4;
    }

    dst_v += 4 * stride - 8;
  }
}
%[ftmp6], %[ftmp8] \n\t" \ + "punpckhwd %[ftmp4], %[ftmp6], %[ftmp8] \n\t" + +void vp8_short_idct4x4llm_mmi(int16_t *input, unsigned char *pred_ptr, + int pred_stride, unsigned char *dst_ptr, + int dst_stride) { + double ftmp[12]; + uint64_t tmp[1]; + double ff_ph_04, ff_ph_4e7b, ff_ph_22a3; + + __asm__ volatile ( + "dli %[tmp0], 0x0004000400040004 \n\t" + "dmtc1 %[tmp0], %[ff_ph_04] \n\t" + "dli %[tmp0], 0x4e7b4e7b4e7b4e7b \n\t" + "dmtc1 %[tmp0], %[ff_ph_4e7b] \n\t" + "dli %[tmp0], 0x22a322a322a322a3 \n\t" + "dmtc1 %[tmp0], %[ff_ph_22a3] \n\t" + MMI_LI(%[tmp0], 0x02) + "dmtc1 %[tmp0], %[ftmp11] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + + "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[ip]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[ip]) \n\t" + "gsldlc1 %[ftmp3], 0x17(%[ip]) \n\t" + "gsldrc1 %[ftmp3], 0x10(%[ip]) \n\t" + "gsldlc1 %[ftmp4], 0x1f(%[ip]) \n\t" + "gsldrc1 %[ftmp4], 0x18(%[ip]) \n\t" + + // ip[0...3] + ip[8...11] + "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t" + // ip[0...3] - ip[8...11] + "psubh %[ftmp6], %[ftmp1], %[ftmp3] \n\t" + // (ip[12...15] * sinpi8sqrt2) >> 16 + "psllh %[ftmp9], %[ftmp4], %[ftmp11] \n\t" + "pmulhh %[ftmp7], %[ftmp9], %[ff_ph_22a3] \n\t" + // (ip[ 4... 7] * sinpi8sqrt2) >> 16 + "psllh %[ftmp9], %[ftmp2], %[ftmp11] \n\t" + "pmulhh %[ftmp8], %[ftmp9], %[ff_ph_22a3] \n\t" + // ip[ 4... 7] + ((ip[ 4... 
7] * cospi8sqrt2minus1) >> 16) + "pmulhh %[ftmp9], %[ftmp2], %[ff_ph_4e7b] \n\t" + "paddh %[ftmp9], %[ftmp9], %[ftmp2] \n\t" + // ip[12...15] + ((ip[12...15] * cospi8sqrt2minus1) >> 16) + "pmulhh %[ftmp10], %[ftmp4], %[ff_ph_4e7b] \n\t" + "paddh %[ftmp10], %[ftmp10], %[ftmp4] \n\t" + + "paddh %[ftmp1], %[ftmp5], %[ftmp7] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "paddh %[ftmp2], %[ftmp6], %[ftmp8] \n\t" + "psubh %[ftmp2], %[ftmp2], %[ftmp10] \n\t" + "psubh %[ftmp3], %[ftmp6], %[ftmp8] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp10] \n\t" + "psubh %[ftmp4], %[ftmp5], %[ftmp7] \n\t" + "psubh %[ftmp4], %[ftmp4], %[ftmp9] \n\t" + + TRANSPOSE_4H + // a + "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t" + // b + "psubh %[ftmp6], %[ftmp1], %[ftmp3] \n\t" + // c + "psllh %[ftmp9], %[ftmp2], %[ftmp11] \n\t" + "pmulhh %[ftmp9], %[ftmp9], %[ff_ph_22a3] \n\t" + "psubh %[ftmp7], %[ftmp9], %[ftmp4] \n\t" + "pmulhh %[ftmp10], %[ftmp4], %[ff_ph_4e7b] \n\t" + "psubh %[ftmp7], %[ftmp7], %[ftmp10] \n\t" + // d + "psllh %[ftmp9], %[ftmp4], %[ftmp11] \n\t" + "pmulhh %[ftmp9], %[ftmp9], %[ff_ph_22a3] \n\t" + "paddh %[ftmp8], %[ftmp9], %[ftmp2] \n\t" + "pmulhh %[ftmp10], %[ftmp2], %[ff_ph_4e7b] \n\t" + "paddh %[ftmp8], %[ftmp8], %[ftmp10] \n\t" + + MMI_LI(%[tmp0], 0x03) + "mtc1 %[tmp0], %[ftmp11] \n\t" + // a + d + "paddh %[ftmp1], %[ftmp5], %[ftmp8] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ff_ph_04] \n\t" + "psrah %[ftmp1], %[ftmp1], %[ftmp11] \n\t" + // b + c + "paddh %[ftmp2], %[ftmp6], %[ftmp7] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ff_ph_04] \n\t" + "psrah %[ftmp2], %[ftmp2], %[ftmp11] \n\t" + // b - c + "psubh %[ftmp3], %[ftmp6], %[ftmp7] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ff_ph_04] \n\t" + "psrah %[ftmp3], %[ftmp3], %[ftmp11] \n\t" + // a - d + "psubh %[ftmp4], %[ftmp5], %[ftmp8] \n\t" + "paddh %[ftmp4], %[ftmp4], %[ff_ph_04] \n\t" + "psrah %[ftmp4], %[ftmp4], %[ftmp11] \n\t" + + TRANSPOSE_4H +#if _MIPS_SIM == _ABIO32 + "ulw %[tmp0], 0x00(%[pred_prt]) \n\t" + "mtc1 %[tmp0], %[ftmp5] 
\n\t" +#else + "gslwlc1 %[ftmp5], 0x03(%[pred_ptr]) \n\t" + "gslwrc1 %[ftmp5], 0x00(%[pred_ptr]) \n\t" +#endif + "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" + "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t" + MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride]) + MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride]) + +#if _MIPS_SIM == _ABIO32 + "ulw %[tmp0], 0x00(%[pred_prt]) \n\t" + "mtc1 %[tmp0], %[ftmp6] \n\t" +#else + "gslwlc1 %[ftmp6], 0x03(%[pred_ptr]) \n\t" + "gslwrc1 %[ftmp6], 0x00(%[pred_ptr]) \n\t" +#endif + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" + "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "gsswlc1 %[ftmp2], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp2], 0x00(%[dst_ptr]) \n\t" + MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride]) + MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride]) + +#if _MIPS_SIM == _ABIO32 + "ulw %[tmp0], 0x00(%[pred_prt]) \n\t" + "mtc1 %[tmp0], %[ftmp7] \n\t" +#else + "gslwlc1 %[ftmp7], 0x03(%[pred_ptr]) \n\t" + "gslwrc1 %[ftmp7], 0x00(%[pred_ptr]) \n\t" +#endif + "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp7] \n\t" + "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "gsswlc1 %[ftmp3], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp3], 0x00(%[dst_ptr]) \n\t" + MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride]) + MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride]) + +#if _MIPS_SIM == _ABIO32 + "ulw %[tmp0], 0x00(%[pred_prt]) \n\t" + "mtc1 %[tmp0], %[ftmp8] \n\t" +#else + "gslwlc1 %[ftmp8], 0x03(%[pred_ptr]) \n\t" + "gslwrc1 %[ftmp8], 0x00(%[pred_ptr]) \n\t" +#endif + "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t" + "paddh %[ftmp4], %[ftmp4], %[ftmp8] \n\t" + "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "gsswlc1 %[ftmp4], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp4], 0x00(%[dst_ptr]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), 
[ftmp2]"=&f"(ftmp[2]), + [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), [ftmp8]"=&f"(ftmp[8]), + [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]), + [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]), + [pred_ptr]"+&r"(pred_ptr), [dst_ptr]"+&r"(dst_ptr), + [ff_ph_4e7b]"=&f"(ff_ph_4e7b), [ff_ph_04]"=&f"(ff_ph_04), + [ff_ph_22a3]"=&f"(ff_ph_22a3) + : [ip]"r"(input), + [pred_stride]"r"((mips_reg)pred_stride), + [dst_stride]"r"((mips_reg)dst_stride) + : "memory" + ); +} + +void vp8_dc_only_idct_add_mmi(int16_t input_dc, unsigned char *pred_ptr, + int pred_stride, unsigned char *dst_ptr, + int dst_stride) { + int a0 = ((input_dc + 4) >> 3); + double a1, ftmp[5]; + int low32; + + __asm__ volatile ( + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "dmtc1 %[a0], %[a1] \n\t" + "pshufh %[a1], %[a1], %[ftmp0] \n\t" + "ulw %[low32], 0x00(%[pred_ptr]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" + "paddsh %[ftmp2], %[ftmp2], %[a1] \n\t" + "packushb %[ftmp1], %[ftmp2], %[ftmp0] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t" + + MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride]) + MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride]) + "ulw %[low32], 0x00(%[pred_ptr]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" + "paddsh %[ftmp2], %[ftmp2], %[a1] \n\t" + "packushb %[ftmp1], %[ftmp2], %[ftmp0] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t" + + MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride]) + MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride]) + "ulw %[low32], 0x00(%[pred_ptr]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" + "paddsh %[ftmp2], %[ftmp2], %[a1] \n\t" + "packushb %[ftmp1], %[ftmp2], %[ftmp0] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t" + + 
MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride]) + MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride]) + "ulw %[low32], 0x00(%[pred_ptr]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" + "paddsh %[ftmp2], %[ftmp2], %[a1] \n\t" + "packushb %[ftmp1], %[ftmp2], %[ftmp0] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]), + [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [low32]"=&r"(low32), + [dst_ptr]"+&r"(dst_ptr), [pred_ptr]"+&r"(pred_ptr), [a1]"=&f"(a1) + : [dst_stride]"r"((mips_reg)dst_stride), + [pred_stride]"r"((mips_reg)pred_stride), [a0]"r"(a0) + : "memory" + ); +} + +void vp8_short_inv_walsh4x4_mmi(int16_t *input, int16_t *mb_dqcoeff) { + int i; + int16_t output[16]; + double ff_ph_03, ftmp[12]; + uint64_t tmp[1]; + + __asm__ volatile ( + "dli %[tmp0], 0x0003000300030003 \n\t" + "dmtc1 %[tmp0], %[ff_ph_03] \n\t" + MMI_LI(%[tmp0], 0x03) + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "dmtc1 %[tmp0], %[ftmp11] \n\t" + "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[ip]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[ip]) \n\t" + "gsldlc1 %[ftmp3], 0x17(%[ip]) \n\t" + "gsldrc1 %[ftmp3], 0x10(%[ip]) \n\t" + "gsldlc1 %[ftmp4], 0x1f(%[ip]) \n\t" + "gsldrc1 %[ftmp4], 0x18(%[ip]) \n\t" + "paddh %[ftmp5], %[ftmp1], %[ftmp2] \n\t" + "psubh %[ftmp6], %[ftmp1], %[ftmp2] \n\t" + "paddh %[ftmp7], %[ftmp3], %[ftmp4] \n\t" + "psubh %[ftmp8], %[ftmp3], %[ftmp4] \n\t" + + "paddh %[ftmp1], %[ftmp5], %[ftmp7] \n\t" + "psubh %[ftmp2], %[ftmp5], %[ftmp7] \n\t" + "psubh %[ftmp3], %[ftmp6], %[ftmp8] \n\t" + "paddh %[ftmp4], %[ftmp6], %[ftmp8] \n\t" + + TRANSPOSE_4H + // a + "paddh %[ftmp5], %[ftmp1], %[ftmp4] \n\t" + // d + "psubh %[ftmp6], %[ftmp1], %[ftmp4] \n\t" + // b + "paddh %[ftmp7], %[ftmp2], %[ftmp3] \n\t" + // c + "psubh %[ftmp8], %[ftmp2], %[ftmp3] \n\t" + + "paddh %[ftmp1], 
%[ftmp5], %[ftmp7] \n\t" + "paddh %[ftmp2], %[ftmp6], %[ftmp8] \n\t" + "psubh %[ftmp3], %[ftmp5], %[ftmp7] \n\t" + "psubh %[ftmp4], %[ftmp6], %[ftmp8] \n\t" + + "paddh %[ftmp1], %[ftmp1], %[ff_ph_03] \n\t" + "psrah %[ftmp1], %[ftmp1], %[ftmp11] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ff_ph_03] \n\t" + "psrah %[ftmp2], %[ftmp2], %[ftmp11] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ff_ph_03] \n\t" + "psrah %[ftmp3], %[ftmp3], %[ftmp11] \n\t" + "paddh %[ftmp4], %[ftmp4], %[ff_ph_03] \n\t" + "psrah %[ftmp4], %[ftmp4], %[ftmp11] \n\t" + + TRANSPOSE_4H + "gssdlc1 %[ftmp1], 0x07(%[op]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[op]) \n\t" + "gssdlc1 %[ftmp2], 0x0f(%[op]) \n\t" + "gssdrc1 %[ftmp2], 0x08(%[op]) \n\t" + "gssdlc1 %[ftmp3], 0x17(%[op]) \n\t" + "gssdrc1 %[ftmp3], 0x10(%[op]) \n\t" + "gssdlc1 %[ftmp4], 0x1f(%[op]) \n\t" + "gssdrc1 %[ftmp4], 0x18(%[op]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]), + [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), [ftmp8]"=&f"(ftmp[8]), + [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]), + [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]), [ff_ph_03]"=&f"(ff_ph_03) + : [ip]"r"(input), [op]"r"(output) + : "memory" + ); + + for (i = 0; i < 16; i++) { + mb_dqcoeff[i * 16] = output[i]; + } +} diff --git a/media/libvpx/libvpx/vp8/common/mips/mmi/loopfilter_filters_mmi.c b/media/libvpx/libvpx/vp8/common/mips/mmi/loopfilter_filters_mmi.c new file mode 100644 index 0000000000..a07a7e3b41 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/mips/mmi/loopfilter_filters_mmi.c @@ -0,0 +1,1415 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vp8/common/loopfilter.h"
#include "vp8/common/onyxc_int.h"
#include "vpx_ports/asmdefs_mmi.h"

/* Normal loop filter across a horizontal edge: processes `count` groups of
 * 8 pixel columns.  Per group it loads four rows above (p3..p0) and four
 * rows below (q0..q3) the edge, builds the filter mask from the pairwise
 * absolute row differences against `limit` and the p0/q0, p1/q1 terms
 * against `blimit`, derives the high-edge-variance flag from `thresh`,
 * then applies the standard VP8 4-tap filter adjustment (ff_pb_03 /
 * ff_pb_04 rounding, sign-bias via ff_pb_80 XOR) to p1, p0, q0, q1 and
 * stores the four modified rows back. */
void vp8_loop_filter_horizontal_edge_mmi(
    unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit,
    const unsigned char *limit, const unsigned char *thresh, int count) {
  uint64_t tmp[1];
  mips_reg addr[2];
  double ftmp[12];
  double ff_ph_01, ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_03;
  /* clang-format off */
  __asm__ volatile (
    /* broadcast constants used by the filter arithmetic */
    "dli %[tmp0], 0x0001000100010001 \n\t"
    "dmtc1 %[tmp0], %[ff_ph_01] \n\t"
    "dli %[tmp0], 0xfefefefefefefefe \n\t"
    "dmtc1 %[tmp0], %[ff_pb_fe] \n\t"
    "dli %[tmp0], 0x8080808080808080 \n\t"
    "dmtc1 %[tmp0], %[ff_pb_80] \n\t"
    "dli %[tmp0], 0x0404040404040404 \n\t"
    "dmtc1 %[tmp0], %[ff_pb_04] \n\t"
    "dli %[tmp0], 0x0303030303030303 \n\t"
    "dmtc1 %[tmp0], %[ff_pb_03] \n\t"
    "1: \n\t"
    "gsldlc1 %[ftmp10], 0x07(%[limit]) \n\t"
    "gsldrc1 %[ftmp10], 0x00(%[limit]) \n\t"

    MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step])

    /* mask: accumulate (|row diff| - limit) saturating terms into ftmp0 */
    MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x4])
    "gsldlc1 %[ftmp1], 0x07(%[addr1]) \n\t"
    "gsldrc1 %[ftmp1], 0x00(%[addr1]) \n\t"

    MMI_SUBU(%[addr1], %[addr0], %[src_pixel_step_x4])
    "gsldlc1 %[ftmp3], 0x07(%[addr1]) \n\t"
    "gsldrc1 %[ftmp3], 0x00(%[addr1]) \n\t"
    "pasubub %[ftmp0], %[ftmp1], %[ftmp3] \n\t"
    "psubusb %[ftmp0], %[ftmp0], %[ftmp10] \n\t"

    MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2])
    "gsldlc1 %[ftmp4], 0x07(%[addr1]) \n\t"
    "gsldrc1 %[ftmp4], 0x00(%[addr1]) \n\t"
    "pasubub %[ftmp1], %[ftmp3], %[ftmp4] \n\t"
    "psubusb %[ftmp1], %[ftmp1], %[ftmp10] \n\t"
    "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t"

    MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
    "gsldlc1 %[ftmp5], 0x07(%[addr1]) \n\t"
    "gsldrc1 %[ftmp5], 0x00(%[addr1]) \n\t"
    /* ftmp9 = |p1 - p0|, reused below for the hev test */
    "pasubub %[ftmp9], %[ftmp4], %[ftmp5] \n\t"
    "psubusb %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
    "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t"

    "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t"
    "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t"

    "gsldlc1 %[ftmp7], 0x07(%[addr0]) \n\t"
    "gsldrc1 %[ftmp7], 0x00(%[addr0]) \n\t"
    /* ftmp11 = |q1 - q0|, reused below for the hev test */
    "pasubub %[ftmp11], %[ftmp7], %[ftmp6] \n\t"
    "psubusb %[ftmp1], %[ftmp11], %[ftmp10] \n\t"
    "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t"

    MMI_ADDU(%[addr1], %[src_ptr], %[src_pixel_step_x2])
    "gsldlc1 %[ftmp8], 0x07(%[addr1]) \n\t"
    "gsldrc1 %[ftmp8], 0x00(%[addr1]) \n\t"
    "pasubub %[ftmp1], %[ftmp8], %[ftmp7] \n\t"
    "psubusb %[ftmp1], %[ftmp1], %[ftmp10] \n\t"
    "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t"

    MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step_x2])
    "gsldlc1 %[ftmp2], 0x07(%[addr1]) \n\t"
    "gsldrc1 %[ftmp2], 0x00(%[addr1]) \n\t"
    "pasubub %[ftmp1], %[ftmp2], %[ftmp8] \n\t"
    "psubusb %[ftmp1], %[ftmp1], %[ftmp10] \n\t"
    "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t"

    /* edge term: 2*|p0-q0| + |p1-q1|/2 compared against blimit */
    "pasubub %[ftmp1], %[ftmp5], %[ftmp6] \n\t"
    "paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t"
    "pasubub %[ftmp2], %[ftmp4], %[ftmp7] \n\t"
    "pand %[ftmp2], %[ftmp2], %[ff_pb_fe] \n\t"
    "dli %[tmp0], 0x01 \n\t"
    "dmtc1 %[tmp0], %[ftmp10] \n\t"
    "psrlh %[ftmp2], %[ftmp2], %[ftmp10] \n\t"
    "paddusb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
    "gsldlc1 %[ftmp10], 0x07(%[blimit]) \n\t"
    "gsldrc1 %[ftmp10], 0x00(%[blimit]) \n\t"
    "psubusb %[ftmp1], %[ftmp1], %[ftmp10] \n\t"
    "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
    "pxor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
    /* ftmp0 = mask: 0xff where the filter applies */
    "pcmpeqb %[ftmp0], %[ftmp0], %[ftmp10] \n\t"

    /* ftmp1 = hev: 0xff where |p1-p0| or |q1-q0| exceeds thresh */
    "gsldlc1 %[ftmp10], 0x07(%[thresh]) \n\t"
    "gsldrc1 %[ftmp10], 0x00(%[thresh]) \n\t"
    "psubusb %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
    "psubusb %[ftmp2], %[ftmp11], %[ftmp10] \n\t"
    "paddb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
    "pxor %[ftmp2], %[ftmp2], %[ftmp2] \n\t"
    "pcmpeqb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
    "pcmpeqb %[ftmp2], %[ftmp2], %[ftmp2] \n\t"
    "pxor %[ftmp1], %[ftmp1], %[ftmp2] \n\t"

    /* convert p1,p0,q0,q1 to signed domain (XOR 0x80) */
    "pxor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t"
    "pxor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t"
    "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t"
    "pxor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t"

    /* filter value = clamp(hev&(p1-q1) + 3*(q0-p0)) & mask */
    "psubsb %[ftmp2], %[ftmp4], %[ftmp7] \n\t"
    "pand %[ftmp2], %[ftmp2], %[ftmp1] \n\t"
    "psubsb %[ftmp3], %[ftmp6], %[ftmp5] \n\t"
    "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
    "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
    "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
    "pand %[ftmp2], %[ftmp2], %[ftmp0] \n\t"

    /* Filter1 = (fv+3)>>3 (ftmp8), Filter2 = (fv+4)>>3 (ftmp9) */
    "paddsb %[ftmp8], %[ftmp2], %[ff_pb_03] \n\t"
    "paddsb %[ftmp9], %[ftmp2], %[ff_pb_04] \n\t"

    /* widen to int16, arithmetic shift by 3 (via <<8 then >>11), repack */
    "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
    "pxor %[ftmp11], %[ftmp11], %[ftmp11] \n\t"
    "punpcklbh %[ftmp0], %[ftmp0], %[ftmp8] \n\t"
    "punpckhbh %[ftmp11], %[ftmp11], %[ftmp8] \n\t"

    "dli %[tmp0], 0x0b \n\t"
    "dmtc1 %[tmp0], %[ftmp10] \n\t"
    "psrah %[ftmp0], %[ftmp0], %[ftmp10] \n\t"
    "psrah %[ftmp11], %[ftmp11], %[ftmp10] \n\t"
    "packsshb %[ftmp8], %[ftmp0], %[ftmp11] \n\t"
    "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
    "punpcklbh %[ftmp0], %[ftmp0], %[ftmp9] \n\t"
    "psrah %[ftmp0], %[ftmp0], %[ftmp10] \n\t"
    "pxor %[ftmp11], %[ftmp11], %[ftmp11] \n\t"
    "punpckhbh %[ftmp9], %[ftmp11], %[ftmp9] \n\t"
    "psrah %[ftmp9], %[ftmp9], %[ftmp10] \n\t"
    /* outer-tap value u = (Filter2 + 1) >> 1, gated off where hev is set */
    "paddsh %[ftmp11], %[ftmp0], %[ff_ph_01] \n\t"
    "packsshb %[ftmp0], %[ftmp0], %[ftmp9] \n\t"
    "paddsh %[ftmp9], %[ftmp9], %[ff_ph_01] \n\t"

    "dli %[tmp0], 0x01 \n\t"
    "dmtc1 %[tmp0], %[ftmp10] \n\t"
    "psrah %[ftmp11], %[ftmp11], %[ftmp10] \n\t"
    "psrah %[ftmp9], %[ftmp9], %[ftmp10] \n\t"
    "packsshb %[ftmp11], %[ftmp11], %[ftmp9] \n\t"
    "pandn %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
    /* p0 += Filter1; store back (convert out of signed domain) */
    "paddsb %[ftmp5], %[ftmp5], %[ftmp8] \n\t"
    "pxor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t"

    MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
    "gssdlc1 %[ftmp5], 0x07(%[addr1]) \n\t"
    "gssdrc1 %[ftmp5], 0x00(%[addr1]) \n\t"
    MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2])
    /* p1 += u */
    "paddsb %[ftmp4], %[ftmp4], %[ftmp1] \n\t"
    "pxor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t"
    "gssdlc1 %[ftmp4], 0x07(%[addr1]) \n\t"
    "gssdrc1 %[ftmp4], 0x00(%[addr1]) \n\t"

    /* q0 -= Filter2 */
    "psubsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
    "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t"
    "gssdlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t"
    "gssdrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t"

    /* q1 -= u */
    "psubsb %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
    "pxor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t"
    "gssdlc1 %[ftmp7], 0x07(%[addr0]) \n\t"
    "gssdrc1 %[ftmp7], 0x00(%[addr0]) \n\t"

    /* next 8-column group */
    "addiu %[count], %[count], -0x01 \n\t"
    MMI_ADDIU(%[src_ptr], %[src_ptr], 0x08)
    "bnez %[count], 1b \n\t"
    : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
      [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
      [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
      [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
      [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
      [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
      [tmp0]"=&r"(tmp[0]),
      [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
      [src_ptr]"+&r"(src_ptr), [count]"+&r"(count),
      [ff_ph_01]"=&f"(ff_ph_01), [ff_pb_fe]"=&f"(ff_pb_fe),
      [ff_pb_80]"=&f"(ff_pb_80), [ff_pb_04]"=&f"(ff_pb_04),
      [ff_pb_03]"=&f"(ff_pb_03)
    : [limit]"r"(limit), [blimit]"r"(blimit),
      [thresh]"r"(thresh),
      [src_pixel_step]"r"((mips_reg)src_pixel_step),
      [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)),
      [src_pixel_step_x4]"r"((mips_reg)(src_pixel_step<<2))
    : "memory"
  );
  /* clang-format on */
}

void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr,
                                       int src_pixel_step,
                                       const unsigned char *blimit,
                                       const unsigned char *limit,
                                       const unsigned char *thresh, int count) {
  uint64_t tmp[1];
  mips_reg addr[2];
  double ftmp[13];
  double ff_pb_fe, ff_ph_01, ff_pb_03, ff_pb_04, ff_pb_80;

  /* clang-format off */
  __asm__ volatile (
    "dli %[tmp0], 0xfefefefefefefefe \n\t"
    "dmtc1 %[tmp0], %[ff_pb_fe] \n\t"
    "dli %[tmp0], 0x0001000100010001 \n\t"
    "dmtc1 %[tmp0], %[ff_ph_01] \n\t"
    "dli %[tmp0], 0x0303030303030303 \n\t"
    "dmtc1 %[tmp0], %[ff_pb_03] \n\t"
    "dli %[tmp0], 0x0404040404040404 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_04] \n\t" + "dli %[tmp0], 0x8080808080808080 \n\t" + "dmtc1 %[tmp0], %[ff_pb_80] \n\t" + MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) + MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0]) + MMI_SUBU(%[src_ptr], %[src_ptr], 0x04) + + "1: \n\t" + MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step]) + + MMI_SLL (%[tmp0], %[src_pixel_step], 0x01) + MMI_ADDU(%[addr1], %[src_ptr], %[tmp0]) + "gsldlc1 %[ftmp11], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp11], 0x00(%[addr1]) \n\t" + MMI_ADDU(%[addr1], %[addr0], %[tmp0]) + "gsldlc1 %[ftmp12], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp12], 0x00(%[addr1]) \n\t" + "punpcklbh %[ftmp1], %[ftmp11], %[ftmp12] \n\t" + "punpckhbh %[ftmp2], %[ftmp11], %[ftmp12] \n\t" + + "gsldlc1 %[ftmp11], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp11], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp12], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp12], 0x00(%[addr0]) \n\t" + "punpcklbh %[ftmp3], %[ftmp11], %[ftmp12] \n\t" + "punpckhbh %[ftmp4], %[ftmp11], %[ftmp12] \n\t" + + "punpcklhw %[ftmp5], %[ftmp4], %[ftmp2] \n\t" + "punpckhhw %[ftmp6], %[ftmp4], %[ftmp2] \n\t" + "punpcklhw %[ftmp7], %[ftmp3], %[ftmp1] \n\t" + "punpckhhw %[ftmp8], %[ftmp3], %[ftmp1] \n\t" + + MMI_SLL(%[tmp0], %[src_pixel_step], 0x01) + MMI_SUBU(%[addr1], %[src_ptr], %[tmp0]) + "gsldlc1 %[ftmp11], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp11], 0x00(%[addr1]) \n\t" + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp12], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp12], 0x00(%[addr1]) \n\t" + "punpcklbh %[ftmp9], %[ftmp11], %[ftmp12] \n\t" + "punpckhbh %[ftmp10], %[ftmp11], %[ftmp12] \n\t" + + MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) + MMI_SUBU(%[addr1], %[src_ptr], %[tmp0]) + "gsldlc1 %[ftmp11], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp11], 0x00(%[addr1]) \n\t" + MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) + MMI_SUBU(%[addr1], %[addr0], %[tmp0]) + "gsldlc1 %[ftmp12], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp12], 0x00(%[addr1]) \n\t" + "punpcklbh %[ftmp0], %[ftmp11], %[ftmp12] \n\t" + 
"punpckhbh %[ftmp11], %[ftmp11], %[ftmp12] \n\t" + + "punpcklhw %[ftmp1], %[ftmp11], %[ftmp10] \n\t" + "punpckhhw %[ftmp2], %[ftmp11], %[ftmp10] \n\t" + "punpcklhw %[ftmp3], %[ftmp0], %[ftmp9] \n\t" + "punpckhhw %[ftmp4], %[ftmp0], %[ftmp9] \n\t" + + /* ftmp9:q0 ftmp10:q1 */ + "punpcklwd %[ftmp9], %[ftmp1], %[ftmp5] \n\t" + "punpckhwd %[ftmp10], %[ftmp1], %[ftmp5] \n\t" + /* ftmp11:q2 ftmp12:q3 */ + "punpcklwd %[ftmp11], %[ftmp2], %[ftmp6] \n\t" + "punpckhwd %[ftmp12], %[ftmp2], %[ftmp6] \n\t" + /* ftmp1:p3 ftmp2:p2 */ + "punpcklwd %[ftmp1], %[ftmp3], %[ftmp7] \n\t" + "punpckhwd %[ftmp2], %[ftmp3], %[ftmp7] \n\t" + /* ftmp5:p1 ftmp6:p0 */ + "punpcklwd %[ftmp5], %[ftmp4], %[ftmp8] \n\t" + "punpckhwd %[ftmp6], %[ftmp4], %[ftmp8] \n\t" + + "gsldlc1 %[ftmp8], 0x07(%[limit]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[limit]) \n\t" + + /* abs (q3-q2) */ + "pasubub %[ftmp7], %[ftmp12], %[ftmp11] \n\t" + "psubusb %[ftmp0], %[ftmp7], %[ftmp8] \n\t" + /* abs (q2-q1) */ + "pasubub %[ftmp7], %[ftmp11], %[ftmp10] \n\t" + "psubusb %[ftmp7], %[ftmp7], %[ftmp8] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* ftmp3: abs(q1-q0) */ + "pasubub %[ftmp3], %[ftmp10], %[ftmp9] \n\t" + "psubusb %[ftmp7], %[ftmp3], %[ftmp8] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* ftmp4: abs(p1-p0) */ + "pasubub %[ftmp4], %[ftmp5], %[ftmp6] \n\t" + "psubusb %[ftmp7], %[ftmp4], %[ftmp8] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* abs (p2-p1) */ + "pasubub %[ftmp7], %[ftmp2], %[ftmp5] \n\t" + "psubusb %[ftmp7], %[ftmp7], %[ftmp8] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* abs (p3-p2) */ + "pasubub %[ftmp7], %[ftmp1], %[ftmp2] \n\t" + "psubusb %[ftmp7], %[ftmp7], %[ftmp8] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + + "gsldlc1 %[ftmp8], 0x07(%[blimit]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[blimit]) \n\t" + + /* abs (p0-q0) */ + "pasubub %[ftmp11], %[ftmp9], %[ftmp6] \n\t" + "paddusb %[ftmp11], %[ftmp11], %[ftmp11] \n\t" + /* abs (p1-q1) */ + "pasubub %[ftmp12], %[ftmp10], %[ftmp5] 
\n\t" + "pand %[ftmp12], %[ftmp12], %[ff_pb_fe] \n\t" + "dli %[tmp0], 0x01 \n\t" + "dmtc1 %[tmp0], %[ftmp1] \n\t" + "psrlh %[ftmp12], %[ftmp12], %[ftmp1] \n\t" + "paddusb %[ftmp1], %[ftmp11], %[ftmp12] \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp8] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "pxor %[ftmp1], %[ftmp1], %[ftmp1] \n\t" + /* ftmp0:mask */ + "pcmpeqb %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + + "gsldlc1 %[ftmp8], 0x07(%[thresh]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[thresh]) \n\t" + + /* ftmp3: abs(q1-q0) ftmp4: abs(p1-p0) */ + "psubusb %[ftmp4], %[ftmp4], %[ftmp8] \n\t" + "psubusb %[ftmp3], %[ftmp3], %[ftmp8] \n\t" + "por %[ftmp2], %[ftmp4], %[ftmp3] \n\t" + "pcmpeqb %[ftmp2], %[ftmp2], %[ftmp1] \n\t" + "pcmpeqb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" + /* ftmp1:hev */ + "pxor %[ftmp1], %[ftmp2], %[ftmp1] \n\t" + + "pxor %[ftmp10], %[ftmp10], %[ff_pb_80] \n\t" + "pxor %[ftmp9], %[ftmp9], %[ff_pb_80] \n\t" + "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "pxor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + + "psubsb %[ftmp2], %[ftmp5], %[ftmp10] \n\t" + "pand %[ftmp2], %[ftmp2], %[ftmp1] \n\t" + "psubsb %[ftmp3], %[ftmp9], %[ftmp6] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + /* ftmp2:filter_value */ + "pand %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + + "paddsb %[ftmp11], %[ftmp2], %[ff_pb_04] \n\t" + "paddsb %[ftmp12], %[ftmp2], %[ff_pb_03] \n\t" + + "dli %[tmp0], 0x0b \n\t" + "dmtc1 %[tmp0], %[ftmp7] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp12] \n\t" + "punpckhbh %[ftmp8], %[ftmp8], %[ftmp12] \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t" + "packsshb %[ftmp12], %[ftmp0], %[ftmp8] \n\t" + + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp11] \n\t" + "punpckhbh %[ftmp8], 
%[ftmp8], %[ftmp11] \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t" + "packsshb %[ftmp11], %[ftmp0], %[ftmp8] \n\t" + + "psubsb %[ftmp9], %[ftmp9], %[ftmp11] \n\t" + "pxor %[ftmp9], %[ftmp9], %[ff_pb_80] \n\t" + "paddsb %[ftmp6], %[ftmp6], %[ftmp12] \n\t" + "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "paddsh %[ftmp0], %[ftmp0], %[ff_ph_01] \n\t" + "paddsh %[ftmp8], %[ftmp8], %[ff_ph_01] \n\t" + + "dli %[tmp0], 0x01 \n\t" + "dmtc1 %[tmp0], %[ftmp7] \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t" + "packsshb %[ftmp2], %[ftmp0], %[ftmp8] \n\t" + "pandn %[ftmp2], %[ftmp1], %[ftmp2] \n\t" + "psubsb %[ftmp10], %[ftmp10], %[ftmp2] \n\t" + "pxor %[ftmp10], %[ftmp10], %[ff_pb_80] \n\t" + "paddsb %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + "pxor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + + /* ftmp5: *op1 ; ftmp6: *op0 */ + "punpcklbh %[ftmp2], %[ftmp5], %[ftmp6] \n\t" + "punpckhbh %[ftmp1], %[ftmp5], %[ftmp6] \n\t" + /* ftmp9: *oq0 ; ftmp10: *oq1 */ + "punpcklbh %[ftmp4], %[ftmp9], %[ftmp10] \n\t" + "punpckhbh %[ftmp3], %[ftmp9], %[ftmp10] \n\t" + "punpckhhw %[ftmp6], %[ftmp2], %[ftmp4] \n\t" + "punpcklhw %[ftmp2], %[ftmp2], %[ftmp4] \n\t" + "punpckhhw %[ftmp5], %[ftmp1], %[ftmp3] \n\t" + "punpcklhw %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + + MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) + MMI_SUBU(%[addr1], %[src_ptr], %[tmp0]) + "gsswlc1 %[ftmp2], 0x05(%[addr1]) \n\t" + "gsswrc1 %[ftmp2], 0x02(%[addr1]) \n\t" + + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "ssrld %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) + MMI_SUBU(%[addr1], %[addr0], %[tmp0]) + "gsswlc1 %[ftmp2], 0x05(%[addr1]) \n\t" + "gsswrc1 %[ftmp2], 0x02(%[addr1]) \n\t" + + MMI_SLL(%[tmp0], %[src_pixel_step], 0x01) + MMI_SUBU(%[addr1], %[src_ptr], %[tmp0]) + "gsswlc1 %[ftmp6], 0x05(%[addr1]) \n\t" + "gsswrc1 %[ftmp6], 0x02(%[addr1]) \n\t" + + "ssrld %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + 
MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gsswlc1 %[ftmp6], 0x05(%[addr1]) \n\t" + "gsswrc1 %[ftmp6], 0x02(%[addr1]) \n\t" + "gsswlc1 %[ftmp1], 0x05(%[src_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x02(%[src_ptr]) \n\t" + + "ssrld %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "gsswlc1 %[ftmp1], 0x05(%[addr0]) \n\t" + "gsswrc1 %[ftmp1], 0x02(%[addr0]) \n\t" + MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step]) + "gsswlc1 %[ftmp5], 0x05(%[addr1]) \n\t" + "gsswrc1 %[ftmp5], 0x02(%[addr1]) \n\t" + + "ssrld %[ftmp5], %[ftmp5], %[ftmp9] \n\t" + MMI_ADDU(%[addr1], %[addr0], %[tmp0]) + "gsswlc1 %[ftmp5], 0x05(%[addr1]) \n\t" + "gsswrc1 %[ftmp5], 0x02(%[addr1]) \n\t" + + MMI_ADDIU(%[count], %[count], -0x01) + MMI_SLL(%[tmp0], %[src_pixel_step], 0x03) + MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0]) + "bnez %[count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]), + [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), + [src_ptr]"+&r"(src_ptr), [count]"+&r"(count), + [ff_ph_01]"=&f"(ff_ph_01), [ff_pb_03]"=&f"(ff_pb_03), + [ff_pb_04]"=&f"(ff_pb_04), [ff_pb_80]"=&f"(ff_pb_80), + [ff_pb_fe]"=&f"(ff_pb_fe) + : [limit]"r"(limit), [blimit]"r"(blimit), + [thresh]"r"(thresh), + [src_pixel_step]"r"((mips_reg)src_pixel_step) + : "memory" + ); + /* clang-format on */ +} + +/* clang-format off */ +#define VP8_MBLOOP_HPSRAB \ + "punpcklbh %[ftmp10], %[ftmp10], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp11], %[ftmp11], %[ftmp0] \n\t" \ + "psrah %[ftmp10], %[ftmp10], %[ftmp9] \n\t" \ + "psrah %[ftmp11], %[ftmp11], %[ftmp9] \n\t" \ + "packsshb %[ftmp0], %[ftmp10], %[ftmp11] \n\t" + +#define VP8_MBLOOP_HPSRAB_ADD(reg) \ + "punpcklbh %[ftmp1], %[ftmp0], %[ftmp12] \n\t" \ + "punpckhbh %[ftmp2], %[ftmp0], %[ftmp12] \n\t" \ + 
"pmulhh %[ftmp1], %[ftmp1], " #reg " \n\t" \ + "pmulhh %[ftmp2], %[ftmp2], " #reg " \n\t" \ + "paddh %[ftmp1], %[ftmp1], %[ff_ph_003f] \n\t" \ + "paddh %[ftmp2], %[ftmp2], %[ff_ph_003f] \n\t" \ + "psrah %[ftmp1], %[ftmp1], %[ftmp9] \n\t" \ + "psrah %[ftmp2], %[ftmp2], %[ftmp9] \n\t" \ + "packsshb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" +/* clang-format on */ +
+/* NOTE(review): the lines above complete VP8_MBLOOP_HPSRAB_ADD: halfword multiply by the */
+/* caller-supplied weight constant (pmulhh), +0x003f rounding, arithmetic shift via psrah, */
+/* and repack to bytes. Below is vp8_mbloop_filter_horizontal_edge_mmi(): the Loongson-MMI */
+/* macroblock-edge VP8 loop filter for horizontal edges. It loads p3..q3 row-wise around */
+/* src_ptr-4*step, builds the mask (pairwise abs-diffs vs *limit, |p0-q0|*2+|p1-q1|/2 vs */
+/* *blimit) and hev flag (vs *thresh), applies the hev +4/+3 >>3 filter via */
+/* VP8_MBLOOP_HPSRAB, then updates p2..q2 with VP8_MBLOOP_HPSRAB_ADD using the */
+/* 0x1b00/0x1200/0x0900 (27,18,9 << 8) weights, storing each row as it is produced; the */
+/* loop advances 8 pixels per 'count' decrement. */
+void vp8_mbloop_filter_horizontal_edge_mmi( + unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit, + const unsigned char *limit, const unsigned char *thresh, int count) { + uint64_t tmp[1]; + double ftmp[13]; + double ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_03, ff_ph_003f, ff_ph_0900, + ff_ph_1200, ff_ph_1b00; + + /* clang-format off */ + __asm__ volatile ( + "dli %[tmp0], 0xfefefefefefefefe \n\t" + "dmtc1 %[tmp0], %[ff_pb_fe] \n\t" + "dli %[tmp0], 0x8080808080808080 \n\t" + "dmtc1 %[tmp0], %[ff_pb_80] \n\t" + "dli %[tmp0], 0x0404040404040404 \n\t" + "dmtc1 %[tmp0], %[ff_pb_04] \n\t" + "dli %[tmp0], 0x0303030303030303 \n\t" + "dmtc1 %[tmp0], %[ff_pb_03] \n\t" + "dli %[tmp0], 0x003f003f003f003f \n\t" + "dmtc1 %[tmp0], %[ff_ph_003f] \n\t" + "dli %[tmp0], 0x0900090009000900 \n\t" + "dmtc1 %[tmp0], %[ff_ph_0900] \n\t" + "dli %[tmp0], 0x1200120012001200 \n\t" + "dmtc1 %[tmp0], %[ff_ph_1200] \n\t" + "dli %[tmp0], 0x1b001b001b001b00 \n\t" + "dmtc1 %[tmp0], %[ff_ph_1b00] \n\t" + MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) + MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0]) + "1: \n\t" + "gsldlc1 %[ftmp9], 0x07(%[limit]) \n\t" + "gsldrc1 %[ftmp9], 0x00(%[limit]) \n\t" + /* ftmp1: p3 */ + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + /* ftmp3: p2 */ + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp3], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp3], 0x00(%[src_ptr]) \n\t" + /* ftmp4: p1 */ + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp4], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[src_ptr]) \n\t" + /* ftmp5: p0 */ + 
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t" + /* ftmp6: q0 */ + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" + /* ftmp7: q1 */ + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t" + /* ftmp8: q2 */ + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t" + /* ftmp2: q3 */ + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp2], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[src_ptr]) \n\t" + + "gsldlc1 %[ftmp12], 0x07(%[blimit]) \n\t" + "gsldrc1 %[ftmp12], 0x00(%[blimit]) \n\t" + + "pasubub %[ftmp0], %[ftmp1], %[ftmp3] \n\t" + "psubusb %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + "pasubub %[ftmp1], %[ftmp3], %[ftmp4] \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "pasubub %[ftmp10], %[ftmp4], %[ftmp5] \n\t" + "psubusb %[ftmp1], %[ftmp10], %[ftmp9] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "pasubub %[ftmp11], %[ftmp7], %[ftmp6] \n\t" + "psubusb %[ftmp1], %[ftmp11], %[ftmp9] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "pasubub %[ftmp1], %[ftmp8], %[ftmp7] \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "pasubub %[ftmp1], %[ftmp2], %[ftmp8] \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + + "pasubub %[ftmp1], %[ftmp5], %[ftmp6] \n\t" + "paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" + "pasubub %[ftmp2], %[ftmp4], %[ftmp7] \n\t" + "pand %[ftmp2], %[ftmp2], %[ff_pb_fe] \n\t" + "dli %[tmp0], 0x01 \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" + "psrlh %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + "paddusb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "psubusb %[ftmp1], 
%[ftmp1], %[ftmp12] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "pxor %[ftmp9], %[ftmp9], %[ftmp9] \n\t" + /* ftmp0: mask */ + "pcmpeqb %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + + "gsldlc1 %[ftmp9], 0x07(%[thresh]) \n\t" + "gsldrc1 %[ftmp9], 0x00(%[thresh]) \n\t" + "psubusb %[ftmp1], %[ftmp10], %[ftmp9] \n\t" + "psubusb %[ftmp2], %[ftmp11], %[ftmp9] \n\t" + "paddb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "pxor %[ftmp2], %[ftmp2], %[ftmp2] \n\t" + "pcmpeqb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "pcmpeqb %[ftmp2], %[ftmp2], %[ftmp2] \n\t" + /* ftmp1: hev */ + "pxor %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + + "pxor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t" + "pxor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "pxor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t" + "psubsb %[ftmp2], %[ftmp4], %[ftmp7] \n\t" + "psubsb %[ftmp9], %[ftmp6], %[ftmp5] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + "pand %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "pandn %[ftmp12], %[ftmp1], %[ftmp2] \n\t" + "pand %[ftmp2], %[ftmp2], %[ftmp1] \n\t" + + "dli %[tmp0], 0x0b \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" + "paddsb %[ftmp0], %[ftmp2], %[ff_pb_03] \n\t" + VP8_MBLOOP_HPSRAB + "paddsb %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "paddsb %[ftmp0], %[ftmp2], %[ff_pb_04] \n\t" + VP8_MBLOOP_HPSRAB + "psubsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + + "dli %[tmp0], 0x07 \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + + VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_1b00]) + "psubsb %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "paddsb %[ftmp5], %[ftmp5], %[ftmp1] \n\t" + "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "pxor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) + MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0]) + "gssdlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 
%[ftmp6], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" + + VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_1200]) + "paddsb %[ftmp4], %[ftmp4], %[ftmp1] \n\t" + "psubsb %[ftmp7], %[ftmp7], %[ftmp1] \n\t" + "pxor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t" + "pxor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp4], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp4], 0x00(%[src_ptr]) \n\t" + + VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_0900]) + "pxor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" + "pxor %[ftmp8], %[ftmp8], %[ff_pb_80] \n\t" + "paddsb %[ftmp3], %[ftmp3], %[ftmp1] \n\t" + "psubsb %[ftmp8], %[ftmp8], %[ftmp1] \n\t" + "pxor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" + "pxor %[ftmp8], %[ftmp8], %[ff_pb_80] \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0]) + "gssdlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0]) + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp3], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp3], 0x00(%[src_ptr]) \n\t" + + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + MMI_ADDIU(%[src_ptr], %[src_ptr], 0x08) + "addiu %[count], %[count], -0x01 \n\t" + "bnez %[count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]), + [src_ptr]"+&r"(src_ptr), [count]"+&r"(count), + [ff_pb_fe]"=&f"(ff_pb_fe), [ff_pb_80]"=&f"(ff_pb_80), + [ff_pb_04]"=&f"(ff_pb_04), [ff_pb_03]"=&f"(ff_pb_03), + [ff_ph_0900]"=&f"(ff_ph_0900), [ff_ph_1b00]"=&f"(ff_ph_1b00), + 
[ff_ph_1200]"=&f"(ff_ph_1200), [ff_ph_003f]"=&f"(ff_ph_003f) + : [limit]"r"(limit), [blimit]"r"(blimit), + [thresh]"r"(thresh), + [src_pixel_step]"r"((mips_reg)src_pixel_step) + : "memory" + ); + /* clang-format on */ +} +
+/* NOTE(review): the lines above finish the horizontal macroblock filter's extended-asm */
+/* operand lists. Next come the VP8_MBLOOP_VPSRAB_ADDH/ADDT helper macros (zero-extend the */
+/* filter bytes into the high half of each halfword, then round with 0x003f and */
+/* arithmetic-shift via psrah before repacking), followed by */
+/* vp8_mbloop_filter_vertical_edge_mmi(): the macroblock (wide) filter on column edges. It */
+/* transposes two 4x8 tiles per iteration, spills q3/p3 to the stack buffer 'srct' to free */
+/* registers, computes mask/hev as usual, and applies the weighted updates where the */
+/* 3*0x0900 and 2*0x0900 halfword weights are synthesized with paddh from ff_ph_0900. The */
+/* span ends with the VP8_SIMPLE_HPSRAB macro and the head of */
+/* vp8_loop_filter_simple_horizontal_edge_mmi() (simple filter; its 'count' is fixed at */
+/* 2, i.e. two 8-pixel halves of a 16-pixel edge). */
+/* clang-format off */ +#define VP8_MBLOOP_VPSRAB_ADDH \ + "pxor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" \ + "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" \ + "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t" + +#define VP8_MBLOOP_VPSRAB_ADDT \ + "paddh %[ftmp7], %[ftmp7], %[ff_ph_003f] \n\t" \ + "paddh %[ftmp8], %[ftmp8], %[ff_ph_003f] \n\t" \ + "psrah %[ftmp7], %[ftmp7], %[ftmp12] \n\t" \ + "psrah %[ftmp8], %[ftmp8], %[ftmp12] \n\t" \ + "packsshb %[ftmp3], %[ftmp7], %[ftmp8] \n\t" +/* clang-format on */ + +void vp8_mbloop_filter_vertical_edge_mmi( + unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit, + const unsigned char *limit, const unsigned char *thresh, int count) { + mips_reg tmp[1]; + DECLARE_ALIGNED(8, const uint64_t, srct[2]); + double ftmp[14]; + double ff_ph_003f, ff_ph_0900, ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_03; + + /* clang-format off */ + __asm__ volatile ( + "dli %[tmp0], 0x003f003f003f003f \n\t" + "dmtc1 %[tmp0], %[ff_ph_003f] \n\t" + "dli %[tmp0], 0x0900090009000900 \n\t" + "dmtc1 %[tmp0], %[ff_ph_0900] \n\t" + "dli %[tmp0], 0xfefefefefefefefe \n\t" + "dmtc1 %[tmp0], %[ff_pb_fe] \n\t" + "dli %[tmp0], 0x8080808080808080 \n\t" + "dmtc1 %[tmp0], %[ff_pb_80] \n\t" + "dli %[tmp0], 0x0404040404040404 \n\t" + "dmtc1 %[tmp0], %[ff_pb_04] \n\t" + "dli %[tmp0], 0x0303030303030303 \n\t" + "dmtc1 %[tmp0], %[ff_pb_03] \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], 0x04) + + "1: \n\t" + "gsldlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp7], 
0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t" + + "punpcklbh %[ftmp11], %[ftmp5], %[ftmp6] \n\t" + "punpckhbh %[ftmp12], %[ftmp5], %[ftmp6] \n\t" + "punpcklbh %[ftmp9], %[ftmp7], %[ftmp8] \n\t" + "punpckhbh %[ftmp10], %[ftmp7], %[ftmp8] \n\t" + + "punpcklhw %[ftmp1], %[ftmp12], %[ftmp10] \n\t" + "punpckhhw %[ftmp2], %[ftmp12], %[ftmp10] \n\t" + "punpcklhw %[ftmp3], %[ftmp11], %[ftmp9] \n\t" + "punpckhhw %[ftmp4], %[ftmp11], %[ftmp9] \n\t" + + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t" + + "punpcklbh %[ftmp11], %[ftmp5], %[ftmp6] \n\t" + "punpckhbh %[ftmp12], %[ftmp5], %[ftmp6] \n\t" + "punpcklbh %[ftmp9], %[ftmp7], %[ftmp8] \n\t" + "punpckhbh %[ftmp10], %[ftmp7], %[ftmp8] \n\t" + + "punpcklhw %[ftmp5], %[ftmp12], %[ftmp10] \n\t" + "punpckhhw %[ftmp6], %[ftmp12], %[ftmp10] \n\t" + "punpcklhw %[ftmp7], %[ftmp11], %[ftmp9] \n\t" + "punpckhhw %[ftmp8], %[ftmp11], %[ftmp9] \n\t" + + "gsldlc1 %[ftmp13], 0x07(%[limit]) \n\t" + "gsldrc1 %[ftmp13], 0x00(%[limit]) \n\t" + /* ftmp9:q0 ftmp10:q1 */ + "punpcklwd %[ftmp9], %[ftmp1], %[ftmp5] \n\t" + "punpckhwd %[ftmp10], %[ftmp1], %[ftmp5] \n\t" + /* ftmp11:q2 ftmp12:q3 */ + "punpcklwd %[ftmp11], %[ftmp2], %[ftmp6] \n\t" + "punpckhwd %[ftmp12], %[ftmp2], %[ftmp6] \n\t" + /* srct[0x00]: q3 */ + "sdc1 %[ftmp12], 0x00(%[srct]) \n\t" + /* ftmp1:p3 ftmp2:p2 */ 
+ "punpcklwd %[ftmp1], %[ftmp3], %[ftmp7] \n\t" + "punpckhwd %[ftmp2], %[ftmp3], %[ftmp7] \n\t" + /* srct[0x08]: p3 */ + "sdc1 %[ftmp1], 0x08(%[srct]) \n\t" + /* ftmp5:p1 ftmp6:p0 */ + "punpcklwd %[ftmp5], %[ftmp4], %[ftmp8] \n\t" + "punpckhwd %[ftmp6], %[ftmp4], %[ftmp8] \n\t" + + /* abs (q3-q2) */ + "pasubub %[ftmp7], %[ftmp12], %[ftmp11] \n\t" + "psubusb %[ftmp0], %[ftmp7], %[ftmp13] \n\t" + /* abs (q2-q1) */ + "pasubub %[ftmp7], %[ftmp11], %[ftmp10] \n\t" + "psubusb %[ftmp7], %[ftmp7], %[ftmp13] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* ftmp3: abs(q1-q0) */ + "pasubub %[ftmp3], %[ftmp10], %[ftmp9] \n\t" + "psubusb %[ftmp7], %[ftmp3], %[ftmp13] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* ftmp4: abs(p1-p0) */ + "pasubub %[ftmp4], %[ftmp5], %[ftmp6] \n\t" + "psubusb %[ftmp7], %[ftmp4], %[ftmp13] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* abs (p2-p1) */ + "pasubub %[ftmp7], %[ftmp2], %[ftmp5] \n\t" + "psubusb %[ftmp7], %[ftmp7], %[ftmp13] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* abs (p3-p2) */ + "pasubub %[ftmp7], %[ftmp1], %[ftmp2] \n\t" + "psubusb %[ftmp7], %[ftmp7], %[ftmp13] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + + "gsldlc1 %[ftmp13], 0x07(%[blimit]) \n\t" + "gsldrc1 %[ftmp13], 0x00(%[blimit]) \n\t" + "gsldlc1 %[ftmp7], 0x07(%[thresh]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[thresh]) \n\t" + /* abs (p0-q0) * 2 */ + "pasubub %[ftmp1], %[ftmp9], %[ftmp6] \n\t" + "paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" + /* abs (p1-q1) / 2 */ + "pasubub %[ftmp12], %[ftmp10], %[ftmp5] \n\t" + "pand %[ftmp12], %[ftmp12], %[ff_pb_fe] \n\t" + "dli %[tmp0], 0x01 \n\t" + "dmtc1 %[tmp0], %[ftmp8] \n\t" + "psrlh %[ftmp12], %[ftmp12], %[ftmp8] \n\t" + "paddusb %[ftmp12], %[ftmp1], %[ftmp12] \n\t" + "psubusb %[ftmp12], %[ftmp12], %[ftmp13] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp12] \n\t" + "pxor %[ftmp12], %[ftmp12], %[ftmp12] \n\t" + /* ftmp0: mask */ + "pcmpeqb %[ftmp0], %[ftmp0], %[ftmp12] \n\t" + + /* abs(p1-p0) - thresh */ + "psubusb 
%[ftmp4], %[ftmp4], %[ftmp7] \n\t" + /* abs(q1-q0) - thresh */ + "psubusb %[ftmp3], %[ftmp3], %[ftmp7] \n\t" + "por %[ftmp3], %[ftmp4], %[ftmp3] \n\t" + "pcmpeqb %[ftmp3], %[ftmp3], %[ftmp12] \n\t" + "pcmpeqb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" + /* ftmp1: hev */ + "pxor %[ftmp1], %[ftmp3], %[ftmp1] \n\t" + + /* ftmp2:ps2, ftmp5:ps1, ftmp6:ps0, ftmp9:qs0, ftmp10:qs1, ftmp11:qs2 */ + "pxor %[ftmp11], %[ftmp11], %[ff_pb_80] \n\t" + "pxor %[ftmp10], %[ftmp10], %[ff_pb_80] \n\t" + "pxor %[ftmp9], %[ftmp9], %[ff_pb_80] \n\t" + "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "pxor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + "pxor %[ftmp2], %[ftmp2], %[ff_pb_80] \n\t" + + "psubsb %[ftmp3], %[ftmp5], %[ftmp10] \n\t" + "psubsb %[ftmp4], %[ftmp9], %[ftmp6] \n\t" + "paddsb %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "paddsb %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "paddsb %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + /* filter_value &= mask */ + "pand %[ftmp0], %[ftmp0], %[ftmp3] \n\t" + /* Filter2 = filter_value & hev */ + "pand %[ftmp3], %[ftmp1], %[ftmp0] \n\t" + /* filter_value &= ~hev */ + "pandn %[ftmp0], %[ftmp1], %[ftmp0] \n\t" + + "paddsb %[ftmp4], %[ftmp3], %[ff_pb_04] \n\t" + "dli %[tmp0], 0x0b \n\t" + "dmtc1 %[tmp0], %[ftmp12] \n\t" + "punpcklbh %[ftmp7], %[ftmp7], %[ftmp4] \n\t" + "punpckhbh %[ftmp8], %[ftmp8], %[ftmp4] \n\t" + "psrah %[ftmp7], %[ftmp7], %[ftmp12] \n\t" + "psrah %[ftmp8], %[ftmp8], %[ftmp12] \n\t" + "packsshb %[ftmp4], %[ftmp7], %[ftmp8] \n\t" + /* ftmp9: qs0 */ + "psubsb %[ftmp9], %[ftmp9], %[ftmp4] \n\t" + "paddsb %[ftmp3], %[ftmp3], %[ff_pb_03] \n\t" + "punpcklbh %[ftmp7], %[ftmp7], %[ftmp3] \n\t" + "punpckhbh %[ftmp8], %[ftmp8], %[ftmp3] \n\t" + "psrah %[ftmp7], %[ftmp7], %[ftmp12] \n\t" + "psrah %[ftmp8], %[ftmp8], %[ftmp12] \n\t" + "packsshb %[ftmp3], %[ftmp7], %[ftmp8] \n\t" + /* ftmp6: ps0 */ + "paddsb %[ftmp6], %[ftmp6], %[ftmp3] \n\t" + + "dli %[tmp0], 0x07 \n\t" + "dmtc1 %[tmp0], %[ftmp12] \n\t" + VP8_MBLOOP_VPSRAB_ADDH + "paddh %[ftmp1], %[ff_ph_0900], 
%[ff_ph_0900] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ff_ph_0900] \n\t" + "pmulhh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" + "pmulhh %[ftmp8], %[ftmp8], %[ftmp1] \n\t" + VP8_MBLOOP_VPSRAB_ADDT + "psubsb %[ftmp4], %[ftmp9], %[ftmp3] \n\t" + /* ftmp9: oq0 */ + "pxor %[ftmp9], %[ftmp4], %[ff_pb_80] \n\t" + "paddsb %[ftmp4], %[ftmp6], %[ftmp3] \n\t" + /* ftmp6: op0 */ + "pxor %[ftmp6], %[ftmp4], %[ff_pb_80] \n\t" + + VP8_MBLOOP_VPSRAB_ADDH + "paddh %[ftmp1], %[ff_ph_0900], %[ff_ph_0900] \n\t" + "pmulhh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" + "pmulhh %[ftmp8], %[ftmp8], %[ftmp1] \n\t" + VP8_MBLOOP_VPSRAB_ADDT + "psubsb %[ftmp4], %[ftmp10], %[ftmp3] \n\t" + /* ftmp10: oq1 */ + "pxor %[ftmp10], %[ftmp4], %[ff_pb_80] \n\t" + "paddsb %[ftmp4], %[ftmp5], %[ftmp3] \n\t" + /* ftmp5: op1 */ + "pxor %[ftmp5], %[ftmp4], %[ff_pb_80] \n\t" + + VP8_MBLOOP_VPSRAB_ADDH + "pmulhh %[ftmp7], %[ftmp7], %[ff_ph_0900] \n\t" + "pmulhh %[ftmp8], %[ftmp8], %[ff_ph_0900] \n\t" + VP8_MBLOOP_VPSRAB_ADDT + "psubsb %[ftmp4], %[ftmp11], %[ftmp3] \n\t" + /* ftmp11: oq2 */ + "pxor %[ftmp11], %[ftmp4], %[ff_pb_80] \n\t" + "paddsb %[ftmp4], %[ftmp2], %[ftmp3] \n\t" + /* ftmp2: op2 */ + "pxor %[ftmp2], %[ftmp4], %[ff_pb_80] \n\t" + + "ldc1 %[ftmp12], 0x00(%[srct]) \n\t" + "ldc1 %[ftmp8], 0x08(%[srct]) \n\t" + + "punpcklbh %[ftmp0], %[ftmp8], %[ftmp2] \n\t" + "punpckhbh %[ftmp1], %[ftmp8], %[ftmp2] \n\t" + "punpcklbh %[ftmp2], %[ftmp5], %[ftmp6] \n\t" + "punpckhbh %[ftmp3], %[ftmp5], %[ftmp6] \n\t" + "punpcklhw %[ftmp4], %[ftmp0], %[ftmp2] \n\t" + "punpckhhw %[ftmp5], %[ftmp0], %[ftmp2] \n\t" + "punpcklhw %[ftmp6], %[ftmp1], %[ftmp3] \n\t" + "punpckhhw %[ftmp7], %[ftmp1], %[ftmp3] \n\t" + + "punpcklbh %[ftmp0], %[ftmp9], %[ftmp10] \n\t" + "punpckhbh %[ftmp1], %[ftmp9], %[ftmp10] \n\t" + "punpcklbh %[ftmp2], %[ftmp11], %[ftmp12] \n\t" + "punpckhbh %[ftmp3], %[ftmp11], %[ftmp12] \n\t" + "punpcklhw %[ftmp8], %[ftmp0], %[ftmp2] \n\t" + "punpckhhw %[ftmp9], %[ftmp0], %[ftmp2] \n\t" + "punpcklhw %[ftmp10], %[ftmp1], 
%[ftmp3] \n\t" + "punpckhhw %[ftmp11], %[ftmp1], %[ftmp3] \n\t" + + "punpcklwd %[ftmp0], %[ftmp7], %[ftmp11] \n\t" + "punpckhwd %[ftmp1], %[ftmp7], %[ftmp11] \n\t" + "gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" + + "punpcklwd %[ftmp0], %[ftmp6], %[ftmp10] \n\t" + "punpckhwd %[ftmp1], %[ftmp6], %[ftmp10] \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" + + "punpcklwd %[ftmp1], %[ftmp5], %[ftmp9] \n\t" + "punpckhwd %[ftmp0], %[ftmp5], %[ftmp9] \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + + "punpcklwd %[ftmp1], %[ftmp4], %[ftmp8] \n\t" + "punpckhwd %[ftmp0], %[ftmp4], %[ftmp8] \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + "addiu %[count], %[count], -0x01 \n\t" + + MMI_SLL(%[tmp0], %[src_pixel_step], 0x03) + MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0]) + "bnez %[count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + 
[ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]), + [tmp0]"=&r"(tmp[0]), [src_ptr]"+&r"(src_ptr), + [count]"+&r"(count), + [ff_ph_003f]"=&f"(ff_ph_003f), [ff_ph_0900]"=&f"(ff_ph_0900), + [ff_pb_03]"=&f"(ff_pb_03), [ff_pb_04]"=&f"(ff_pb_04), + [ff_pb_80]"=&f"(ff_pb_80), [ff_pb_fe]"=&f"(ff_pb_fe) + : [limit]"r"(limit), [blimit]"r"(blimit), + [srct]"r"(srct), [thresh]"r"(thresh), + [src_pixel_step]"r"((mips_reg)src_pixel_step) + : "memory" + ); + /* clang-format on */ +} + +/* clang-format off */ +#define VP8_SIMPLE_HPSRAB \ + "psllh %[ftmp0], %[ftmp5], %[ftmp8] \n\t" \ + "psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t" \ + "psrlh %[ftmp0], %[ftmp0], %[ftmp8] \n\t" \ + "psrah %[ftmp1], %[ftmp5], %[ftmp10] \n\t" \ + "psllh %[ftmp1], %[ftmp1], %[ftmp8] \n\t" \ + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" +/* clang-format on */ + +void vp8_loop_filter_simple_horizontal_edge_mmi(unsigned char *src_ptr, + int src_pixel_step, + const unsigned char *blimit) { + uint64_t tmp[1], count = 2; + mips_reg addr[2]; + double ftmp[12]; + double ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_01; + + /* clang-format off */ + __asm__ volatile ( + "dli %[tmp0], 0x0b \n\t" + "dmtc1 %[tmp0], %[ftmp10] \n\t" + "dli %[tmp0], 0x01 \n\t" + "dmtc1 %[tmp0], %[ftmp11] \n\t" + "dli %[tmp0], 0x08 \n\t" + "dmtc1 %[tmp0], %[ftmp8] \n\t" + "dli %[tmp0], 0x03 \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" + "dli %[tmp0], 0x0b \n\t" + "dmtc1 %[tmp0], %[ftmp10] \n\t" + "dli %[tmp0], 0x01 \n\t" + "dmtc1 %[tmp0], %[ftmp11] \n\t" + "dli %[tmp0], 0xfefefefefefefefe \n\t" + "dmtc1 %[tmp0], %[ff_pb_fe] \n\t" + "dli %[tmp0], 0x8080808080808080 \n\t" + "dmtc1 %[tmp0], %[ff_pb_80] \n\t" + "dli %[tmp0], 0x0404040404040404 \n\t" + "dmtc1 %[tmp0], %[ff_pb_04] \n\t" + "dli %[tmp0], 0x0101010101010101 \n\t" + "dmtc1 %[tmp0], %[ff_pb_01] \n\t" + + "1: \n\t" + "gsldlc1 %[ftmp3], 0x07(%[blimit]) \n\t" + "gsldrc1 %[ftmp3], 0x00(%[blimit]) \n\t" + + MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step]) + + MMI_SUBU(%[addr1], %[src_ptr], 
%[src_pixel_step_x2]) + "gsldlc1 %[ftmp2], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[addr1]) \n\t" + "gsldlc1 %[ftmp7], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[addr0]) \n\t" + "pasubub %[ftmp1], %[ftmp7], %[ftmp2] \n\t" + "pand %[ftmp1], %[ftmp1], %[ff_pb_fe] \n\t" + "psrlh %[ftmp1], %[ftmp1], %[ftmp11] \n\t" + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp6], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[addr1]) \n\t" + "gsldlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" + "pasubub %[ftmp5], %[ftmp6], %[ftmp0] \n\t" + "paddusb %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "paddusb %[ftmp5], %[ftmp5], %[ftmp1] \n\t" + "psubusb %[ftmp5], %[ftmp5], %[ftmp3] \n\t" + "pxor %[ftmp3], %[ftmp3], %[ftmp3] \n\t" + "pcmpeqb %[ftmp5], %[ftmp5], %[ftmp3] \n\t" + + "pxor %[ftmp2], %[ftmp2], %[ff_pb_80] \n\t" + "pxor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t" + "psubsb %[ftmp2], %[ftmp2], %[ftmp7] \n\t" + "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "pxor %[ftmp3], %[ftmp0], %[ff_pb_80] \n\t" + "psubsb %[ftmp0], %[ftmp3], %[ftmp6] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "pand %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + + "paddsb %[ftmp5], %[ftmp5], %[ff_pb_04] \n\t" + VP8_SIMPLE_HPSRAB + "psubsb %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "pxor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" + "gssdlc1 %[ftmp3], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp3], 0x00(%[src_ptr]) \n\t" + + "psubsb %[ftmp5], %[ftmp5], %[ff_pb_01] \n\t" + VP8_SIMPLE_HPSRAB + "paddsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp6], 0x07(%[addr1]) \n\t" + "gssdrc1 %[ftmp6], 0x00(%[addr1]) \n\t" + + "addiu %[count], %[count], -0x01 \n\t" + MMI_ADDIU(%[src_ptr], %[src_ptr], 0x08) + "bnez %[count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + 
+ /* NOTE(review): above is the body of vp8_loop_filter_simple_horizontal_edge_mmi(): the */
+ /* simple VP8 filter touching only p1/p0/q0/q1. The mask is */
+ /* (|p0-q0|*2 + |p1-q1|/2) <= *blimit; the filter value clamp(p1-q1) + 3*(q0-p0) is */
+ /* applied as (f+4)>>3 subtracted from q0 and (f+4-1)>>3 (via the ff_pb_01 psubsb) added */
+ /* to p0, two 8-pixel halves per call (count = 2). Below begins */
+ /* vp8_loop_filter_simple_vertical_edge_mmi(), the 4x8-transpose column variant, which */
+ /* spills two transposed rows to 'srct'; its body continues past this chunk. */
+ /* NOTE(review): the duplicated dli/dmtc1 constant loads at the top of both simple */
+ /* filters (0x0b/0x01 here, 0x08/0x20 below, each loaded twice) are redundant but */
+ /* harmless. */
[ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [tmp0]"=&r"(tmp[0]), + [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), + [src_ptr]"+&r"(src_ptr), [count]"+&r"(count), + [ff_pb_fe]"=&f"(ff_pb_fe), [ff_pb_80]"=&f"(ff_pb_80), + [ff_pb_04]"=&f"(ff_pb_04), [ff_pb_01]"=&f"(ff_pb_01) + : [blimit]"r"(blimit), + [src_pixel_step]"r"((mips_reg)src_pixel_step), + [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)) + : "memory" + ); + /* clang-format on */ +} + +void vp8_loop_filter_simple_vertical_edge_mmi(unsigned char *src_ptr, + int src_pixel_step, + const unsigned char *blimit) { + uint64_t tmp[1], count = 2; + mips_reg addr[2]; + DECLARE_ALIGNED(8, const uint64_t, srct[2]); + double ftmp[12], ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_01; + + /* clang-format off */ + __asm__ volatile ( + "dli %[tmp0], 0x08 \n\t" + "dmtc1 %[tmp0], %[ftmp8] \n\t" + "dli %[tmp0], 0x20 \n\t" + "dmtc1 %[tmp0], %[ftmp10] \n\t" + "dli %[tmp0], 0x08 \n\t" + "dmtc1 %[tmp0], %[ftmp8] \n\t" + "dli %[tmp0], 0x20 \n\t" + "dmtc1 %[tmp0], %[ftmp10] \n\t" + "dli %[tmp0], 0xfefefefefefefefe \n\t" + "dmtc1 %[tmp0], %[ff_pb_fe] \n\t" + "dli %[tmp0], 0x8080808080808080 \n\t" + "dmtc1 %[tmp0], %[ff_pb_80] \n\t" + "dli %[tmp0], 0x0404040404040404 \n\t" + "dmtc1 %[tmp0], %[ff_pb_04] \n\t" + "dli %[tmp0], 0x0101010101010101 \n\t" + "dmtc1 %[tmp0], %[ff_pb_01] \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step_x4]) + MMI_SUBU(%[src_ptr], %[src_ptr], 0x02) + + "1: \n\t" + MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step]) + MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step_x2]) + "gslwlc1 %[ftmp0], 0x03(%[addr1]) \n\t" + "gslwrc1 %[ftmp0], 0x00(%[addr1]) \n\t" + MMI_ADDU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) + "gslwlc1 %[ftmp6], 0x03(%[addr1]) \n\t" + "gslwrc1 %[ftmp6], 0x00(%[addr1]) \n\t" + "punpcklbh %[ftmp6], 
%[ftmp6], %[ftmp0] \n\t" + + MMI_ADDU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gslwlc1 %[ftmp0], 0x03(%[addr1]) \n\t" + "gslwrc1 %[ftmp0], 0x00(%[addr1]) \n\t" + "gslwlc1 %[ftmp4], 0x03(%[src_ptr]) \n\t" + "gslwrc1 %[ftmp4], 0x00(%[src_ptr]) \n\t" + + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "punpckhhw %[ftmp5], %[ftmp4], %[ftmp6] \n\t" + "punpcklhw %[ftmp4], %[ftmp4], %[ftmp6] \n\t" + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gslwlc1 %[ftmp7], 0x03(%[addr1]) \n\t" + "gslwrc1 %[ftmp7], 0x00(%[addr1]) \n\t" + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) + "gslwlc1 %[ftmp6], 0x03(%[addr1]) \n\t" + "gslwrc1 %[ftmp6], 0x00(%[addr1]) \n\t" + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp7] \n\t" + + MMI_SUBU(%[addr1], %[addr0], %[src_pixel_step_x4]) + "gslwlc1 %[ftmp1], 0x03(%[addr1]) \n\t" + "gslwrc1 %[ftmp1], 0x00(%[addr1]) \n\t" + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x4]) + "gslwlc1 %[ftmp0], 0x03(%[addr1]) \n\t" + "gslwrc1 %[ftmp0], 0x00(%[addr1]) \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + + "punpckhhw %[ftmp2], %[ftmp0], %[ftmp6] \n\t" + "punpcklhw %[ftmp0], %[ftmp0], %[ftmp6] \n\t" + "punpckhwd %[ftmp1], %[ftmp0], %[ftmp4] \n\t" + "punpcklwd %[ftmp0], %[ftmp0], %[ftmp4] \n\t" + "punpckhwd %[ftmp3], %[ftmp2], %[ftmp5] \n\t" + "punpcklwd %[ftmp2], %[ftmp2], %[ftmp5] \n\t" + + "dli %[tmp0], 0x01 \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" + "pasubub %[ftmp6], %[ftmp3], %[ftmp0] \n\t" + "pand %[ftmp6], %[ftmp6], %[ff_pb_fe] \n\t" + "psrlh %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + "pasubub %[ftmp5], %[ftmp1], %[ftmp2] \n\t" + "paddusb %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "paddusb %[ftmp5], %[ftmp5], %[ftmp6] \n\t" + + "gsldlc1 %[ftmp7], 0x07(%[blimit]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[blimit]) \n\t" + "psubusb %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "pxor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" + "pcmpeqb %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + + "sdc1 %[ftmp0], 0x00(%[srct]) \n\t" + "sdc1 %[ftmp3], 0x08(%[srct]) \n\t" + + "pxor %[ftmp0], 
%[ftmp0], %[ff_pb_80] \n\t" + "pxor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" + "psubsb %[ftmp0], %[ftmp0], %[ftmp3] \n\t" + + "pxor %[ftmp6], %[ftmp1], %[ff_pb_80] \n\t" + "pxor %[ftmp3], %[ftmp2], %[ff_pb_80] \n\t" + "psubsb %[ftmp7], %[ftmp3], %[ftmp6] \n\t" + "paddsb %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "paddsb %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "paddsb %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "pand %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "paddsb %[ftmp5], %[ftmp5], %[ff_pb_04] \n\t" + + "dli %[tmp0], 0x03 \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" + "psllh %[ftmp0], %[ftmp5], %[ftmp8] \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + "psrlh %[ftmp0], %[ftmp0], %[ftmp8] \n\t" + + "dli %[tmp0], 0x0b \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" + "psrah %[ftmp7], %[ftmp5], %[ftmp9] \n\t" + "psllh %[ftmp7], %[ftmp7], %[ftmp8] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "psubsb %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "pxor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" + "psubsb %[ftmp5], %[ftmp5], %[ff_pb_01] \n\t" + + "dli %[tmp0], 0x03 \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" + "psllh %[ftmp0], %[ftmp5], %[ftmp8] \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + "psrlh %[ftmp0], %[ftmp0], %[ftmp8] \n\t" + + "dli %[tmp0], 0x0b \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" + "psrah %[ftmp5], %[ftmp5], %[ftmp9] \n\t" + "psllh %[ftmp5], %[ftmp5], %[ftmp8] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp5] \n\t" + "paddsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + + "ldc1 %[ftmp0], 0x00(%[srct]) \n\t" + "ldc1 %[ftmp4], 0x08(%[srct]) \n\t" + + "punpckhbh %[ftmp1], %[ftmp0], %[ftmp6] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp6] \n\t" + "punpcklbh %[ftmp2], %[ftmp3], %[ftmp4] \n\t" + "punpckhbh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + + "punpckhhw %[ftmp6], %[ftmp0], %[ftmp2] \n\t" + "punpcklhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x4]) + "gsswlc1 %[ftmp0], 0x03(%[addr1]) \n\t" + "gsswrc1 %[ftmp0], 0x00(%[addr1]) \n\t" + 
"punpckhhw %[ftmp5], %[ftmp1], %[ftmp3] \n\t" + "punpcklhw %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + + "ssrld %[ftmp0], %[ftmp0], %[ftmp10] \n\t" + MMI_SUBU(%[addr1], %[addr0], %[src_pixel_step_x4]) + "gsswlc1 %[ftmp0], 0x03(%[addr1]) \n\t" + "gsswrc1 %[ftmp0], 0x00(%[addr1]) \n\t" + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) + "gsswlc1 %[ftmp6], 0x03(%[addr1]) \n\t" + "gsswrc1 %[ftmp6], 0x00(%[addr1]) \n\t" + + "ssrld %[ftmp6], %[ftmp6], %[ftmp10] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[src_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gsswlc1 %[ftmp6], 0x03(%[addr1]) \n\t" + "gsswrc1 %[ftmp6], 0x00(%[addr1]) \n\t" + + MMI_ADDU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) + "gsswlc1 %[ftmp5], 0x03(%[addr1]) \n\t" + "gsswrc1 %[ftmp5], 0x00(%[addr1]) \n\t" + + "ssrld %[ftmp1], %[ftmp1], %[ftmp10] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[addr0]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[addr0]) \n\t" + + "ssrld %[ftmp5], %[ftmp5], %[ftmp10] \n\t" + MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step_x2]) + "gsswlc1 %[ftmp5], 0x03(%[addr1]) \n\t" + "gsswrc1 %[ftmp5], 0x00(%[addr1]) \n\t" + + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step_x8]) + "addiu %[count], %[count], -0x01 \n\t" + "bnez %[count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [tmp0]"=&r"(tmp[0]), + [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), + [src_ptr]"+&r"(src_ptr), [count]"+&r"(count), + [ff_pb_fe]"=&f"(ff_pb_fe), [ff_pb_80]"=&f"(ff_pb_80), + [ff_pb_04]"=&f"(ff_pb_04), [ff_pb_01]"=&f"(ff_pb_01) + : [blimit]"r"(blimit), [srct]"r"(srct), + [src_pixel_step]"r"((mips_reg)src_pixel_step), + [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)), + 
[src_pixel_step_x4]"r"((mips_reg)(src_pixel_step<<2)), + [src_pixel_step_x8]"r"((mips_reg)(src_pixel_step<<3)) + : "memory" + ); + /* clang-format on */ +} + +/* Horizontal MB filtering */ +void vp8_loop_filter_mbh_mmi(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { + vp8_mbloop_filter_horizontal_edge_mmi(y_ptr, y_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 2); + + if (u_ptr) + vp8_mbloop_filter_horizontal_edge_mmi(u_ptr, uv_stride, lfi->mblim, + lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp8_mbloop_filter_horizontal_edge_mmi(v_ptr, uv_stride, lfi->mblim, + lfi->lim, lfi->hev_thr, 1); +} + +/* Vertical MB Filtering */ +void vp8_loop_filter_mbv_mmi(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { + vp8_mbloop_filter_vertical_edge_mmi(y_ptr, y_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 2); + + if (u_ptr) + vp8_mbloop_filter_vertical_edge_mmi(u_ptr, uv_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); + + if (v_ptr) + vp8_mbloop_filter_vertical_edge_mmi(v_ptr, uv_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); +} + +/* Horizontal B Filtering */ +void vp8_loop_filter_bh_mmi(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { + vp8_loop_filter_horizontal_edge_mmi(y_ptr + 4 * y_stride, y_stride, lfi->blim, + lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_mmi(y_ptr + 8 * y_stride, y_stride, lfi->blim, + lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_mmi(y_ptr + 12 * y_stride, y_stride, + lfi->blim, lfi->lim, lfi->hev_thr, 2); + + if (u_ptr) + vp8_loop_filter_horizontal_edge_mmi(u_ptr + 4 * uv_stride, uv_stride, + lfi->blim, lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp8_loop_filter_horizontal_edge_mmi(v_ptr + 4 * uv_stride, uv_stride, + lfi->blim, lfi->lim, lfi->hev_thr, 1); +} + +/* Vertical B Filtering */ +void 
vp8_loop_filter_bv_mmi(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { + vp8_loop_filter_vertical_edge_mmi(y_ptr + 4, y_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_mmi(y_ptr + 8, y_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_mmi(y_ptr + 12, y_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 2); + + if (u_ptr) + vp8_loop_filter_vertical_edge_mmi(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 1); + + if (v_ptr) + vp8_loop_filter_vertical_edge_mmi(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 1); +} + +void vp8_loop_filter_bhs_mmi(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) { + vp8_loop_filter_simple_horizontal_edge_mmi(y_ptr + 4 * y_stride, y_stride, + blimit); + vp8_loop_filter_simple_horizontal_edge_mmi(y_ptr + 8 * y_stride, y_stride, + blimit); + vp8_loop_filter_simple_horizontal_edge_mmi(y_ptr + 12 * y_stride, y_stride, + blimit); +} + +void vp8_loop_filter_bvs_mmi(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) { + vp8_loop_filter_simple_vertical_edge_mmi(y_ptr + 4, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_mmi(y_ptr + 8, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_mmi(y_ptr + 12, y_stride, blimit); +} diff --git a/media/libvpx/libvpx/vp8/common/mips/mmi/sixtap_filter_mmi.c b/media/libvpx/libvpx/vp8/common/mips/mmi/sixtap_filter_mmi.c new file mode 100644 index 0000000000..b85f73fdff --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/mips/mmi/sixtap_filter_mmi.c @@ -0,0 +1,427 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp8/common/filter.h" +#include "vpx_ports/asmdefs_mmi.h" + +DECLARE_ALIGNED(8, static const int16_t, vp8_six_tap_mmi[8][6 * 8]) = { + { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0080, 0x0080, 0x0080, 0x0080, 0x0080, 0x0080, 0x0080, 0x0080, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, + 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, + 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + { 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, + 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, + 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, + 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, + 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, + 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001 }, + { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, + 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, + 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, + 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + { 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, + 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, + 0x004d, 
0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, + 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, + 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, + 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003 }, + { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, + 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, + 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, + 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + { 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, + 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, + 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, + 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, + 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, + 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002 }, + { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, + 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, + 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 } +}; + +/* Horizontal filter: pixel_step is 1, output_height and output_width are + the size of horizontal filtering output, output_height is always H + 5 */ +static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr, + uint16_t *output_ptr, + unsigned int src_pixels_per_line, + unsigned int output_height, + unsigned int output_width, + const int16_t *vp8_filter) { + uint64_t tmp[1]; + double ff_ph_40; +#if _MIPS_SIM == _ABIO32 + register double fzero asm("$f0"); + register double ftmp0 asm("$f2"); + register double ftmp1 
asm("$f4"); + register double ftmp2 asm("$f6"); + register double ftmp3 asm("$f8"); + register double ftmp4 asm("$f10"); + register double ftmp5 asm("$f12"); + register double ftmp6 asm("$f14"); + register double ftmp7 asm("$f16"); + register double ftmp8 asm("$f18"); + register double ftmp9 asm("$f20"); + register double ftmp10 asm("$f22"); + register double ftmp11 asm("$f24"); +#else + register double fzero asm("$f0"); + register double ftmp0 asm("$f1"); + register double ftmp1 asm("$f2"); + register double ftmp2 asm("$f3"); + register double ftmp3 asm("$f4"); + register double ftmp4 asm("$f5"); + register double ftmp5 asm("$f6"); + register double ftmp6 asm("$f7"); + register double ftmp7 asm("$f8"); + register double ftmp8 asm("$f9"); + register double ftmp9 asm("$f10"); + register double ftmp10 asm("$f11"); + register double ftmp11 asm("$f12"); +#endif // _MIPS_SIM == _ABIO32 + + /* clang-format off */ + __asm__ volatile ( + "dli %[tmp0], 0x0040004000400040 \n\t" + "dmtc1 %[tmp0], %[ff_ph_40] \n\t" + "ldc1 %[ftmp0], 0x00(%[vp8_filter]) \n\t" + "ldc1 %[ftmp1], 0x10(%[vp8_filter]) \n\t" + "ldc1 %[ftmp2], 0x20(%[vp8_filter]) \n\t" + "ldc1 %[ftmp3], 0x30(%[vp8_filter]) \n\t" + "ldc1 %[ftmp4], 0x40(%[vp8_filter]) \n\t" + "ldc1 %[ftmp5], 0x50(%[vp8_filter]) \n\t" + "pxor %[fzero], %[fzero], %[fzero] \n\t" + "dli %[tmp0], 0x07 \n\t" + "dmtc1 %[tmp0], %[ftmp7] \n\t" + "dli %[tmp0], 0x08 \n\t" + "dmtc1 %[tmp0], %[ftmp11] \n\t" + + "1: \n\t" + "gsldlc1 %[ftmp9], 0x05(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp9], -0x02(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp10], 0x06(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp10], -0x01(%[src_ptr]) \n\t" + + "punpcklbh %[ftmp6], %[ftmp9], %[fzero] \n\t" + "pmullh %[ftmp8], %[ftmp6], %[ftmp0] \n\t" + + "punpckhbh %[ftmp6], %[ftmp9], %[fzero] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp4] \n\t" + "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" + + "punpcklbh %[ftmp6], %[ftmp10], %[fzero] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "paddsh %[ftmp8], 
%[ftmp8], %[ftmp6] \n\t" + + "punpckhbh %[ftmp6], %[ftmp10], %[fzero] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp5] \n\t" + "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" + + "ssrld %[ftmp10], %[ftmp10], %[ftmp11] \n\t" + "punpcklbh %[ftmp6], %[ftmp10], %[fzero] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" + + "ssrld %[ftmp10], %[ftmp10], %[ftmp11] \n\t" + "punpcklbh %[ftmp6], %[ftmp10], %[fzero] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" + "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" + + "paddsh %[ftmp8], %[ftmp8], %[ff_ph_40] \n\t" + "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t" + "packushb %[ftmp8], %[ftmp8], %[fzero] \n\t" + "punpcklbh %[ftmp8], %[ftmp8], %[fzero] \n\t" + "gssdlc1 %[ftmp8], 0x07(%[output_ptr]) \n\t" + "gssdrc1 %[ftmp8], 0x00(%[output_ptr]) \n\t" + + "addiu %[output_height], %[output_height], -0x01 \n\t" + MMI_ADDU(%[output_ptr], %[output_ptr], %[output_width]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixels_per_line]) + "bnez %[output_height], 1b \n\t" + : [fzero]"=&f"(fzero), [ftmp0]"=&f"(ftmp0), + [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), + [ftmp3]"=&f"(ftmp3), [ftmp4]"=&f"(ftmp4), + [ftmp5]"=&f"(ftmp5), [ftmp6]"=&f"(ftmp6), + [ftmp7]"=&f"(ftmp7), [ftmp8]"=&f"(ftmp8), + [ftmp9]"=&f"(ftmp9), [ftmp10]"=&f"(ftmp10), + [ftmp11]"=&f"(ftmp11), [tmp0]"=&r"(tmp[0]), + [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height), + [src_ptr]"+&r"(src_ptr), [ff_ph_40]"=&f"(ff_ph_40) + : [src_pixels_per_line]"r"((mips_reg)src_pixels_per_line), + [vp8_filter]"r"(vp8_filter), [output_width]"r"(output_width) + : "memory" + ); + /* clang-format on */ +} + +/* Horizontal filter: pixel_step is always W */ +static INLINE void vp8_filter_block1dc_v6_mmi( + uint16_t *src_ptr, unsigned char *output_ptr, unsigned int output_height, + int output_pitch, unsigned int pixels_per_line, const int16_t *vp8_filter) { + double ff_ph_40; + uint64_t tmp[1]; + mips_reg addr[1]; + +#if _MIPS_SIM == _ABIO32 + register double 
fzero asm("$f0"); + register double ftmp0 asm("$f2"); + register double ftmp1 asm("$f4"); + register double ftmp2 asm("$f6"); + register double ftmp3 asm("$f8"); + register double ftmp4 asm("$f10"); + register double ftmp5 asm("$f12"); + register double ftmp6 asm("$f14"); + register double ftmp7 asm("$f16"); + register double ftmp8 asm("$f18"); + register double ftmp9 asm("$f20"); + register double ftmp10 asm("$f22"); + register double ftmp11 asm("$f24"); + register double ftmp12 asm("$f26"); + register double ftmp13 asm("$f28"); +#else + register double fzero asm("$f0"); + register double ftmp0 asm("$f1"); + register double ftmp1 asm("$f2"); + register double ftmp2 asm("$f3"); + register double ftmp3 asm("$f4"); + register double ftmp4 asm("$f5"); + register double ftmp5 asm("$f6"); + register double ftmp6 asm("$f7"); + register double ftmp7 asm("$f8"); + register double ftmp8 asm("$f9"); + register double ftmp9 asm("$f10"); + register double ftmp10 asm("$f11"); + register double ftmp11 asm("$f12"); + register double ftmp12 asm("$f13"); + register double ftmp13 asm("$f14"); +#endif // _MIPS_SIM == _ABIO32 + + /* clang-format off */ + __asm__ volatile ( + "dli %[tmp0], 0x0040004000400040 \n\t" + "dmtc1 %[tmp0], %[ff_ph_40] \n\t" + "ldc1 %[ftmp0], 0x00(%[vp8_filter]) \n\t" + "ldc1 %[ftmp1], 0x10(%[vp8_filter]) \n\t" + "ldc1 %[ftmp2], 0x20(%[vp8_filter]) \n\t" + "ldc1 %[ftmp3], 0x30(%[vp8_filter]) \n\t" + "ldc1 %[ftmp4], 0x40(%[vp8_filter]) \n\t" + "ldc1 %[ftmp5], 0x50(%[vp8_filter]) \n\t" + "pxor %[fzero], %[fzero], %[fzero] \n\t" + "dli %[tmp0], 0x07 \n\t" + "dmtc1 %[tmp0], %[ftmp13] \n\t" + + /* In order to make full use of memory load delay slot, + * Operation of memory loading and calculating has been rearranged. 
+ */ + "1: \n\t" + "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line]) + "gsldlc1 %[ftmp7], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[addr0]) \n\t" + MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x2]) + "gsldlc1 %[ftmp8], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[addr0]) \n\t" + + MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x4]) + "gsldlc1 %[ftmp9], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp9], 0x00(%[addr0]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[pixels_per_line]) + MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x2]) + "gsldlc1 %[ftmp10], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp10], 0x00(%[addr0]) \n\t" + MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x4]) + "gsldlc1 %[ftmp11], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp11], 0x00(%[addr0]) \n\t" + + "pmullh %[ftmp12], %[ftmp6], %[ftmp0] \n\t" + + "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" + "paddsh %[ftmp12], %[ftmp12], %[ftmp7] \n\t" + + "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" + "paddsh %[ftmp12], %[ftmp12], %[ftmp8] \n\t" + + "pmullh %[ftmp9], %[ftmp9], %[ftmp4] \n\t" + "paddsh %[ftmp12], %[ftmp12], %[ftmp9] \n\t" + + "pmullh %[ftmp10], %[ftmp10], %[ftmp3] \n\t" + "paddsh %[ftmp12], %[ftmp12], %[ftmp10] \n\t" + + "pmullh %[ftmp11], %[ftmp11], %[ftmp5] \n\t" + "paddsh %[ftmp12], %[ftmp12], %[ftmp11] \n\t" + + "paddsh %[ftmp12], %[ftmp12], %[ff_ph_40] \n\t" + "psrah %[ftmp12], %[ftmp12], %[ftmp13] \n\t" + "packushb %[ftmp12], %[ftmp12], %[fzero] \n\t" + "gsswlc1 %[ftmp12], 0x03(%[output_ptr]) \n\t" + "gsswrc1 %[ftmp12], 0x00(%[output_ptr]) \n\t" + + MMI_ADDIU(%[output_height], %[output_height], -0x01) + MMI_ADDU(%[output_ptr], %[output_ptr], %[output_pitch]) + "bnez %[output_height], 1b \n\t" + : [fzero]"=&f"(fzero), [ftmp0]"=&f"(ftmp0), + [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), + [ftmp3]"=&f"(ftmp3), [ftmp4]"=&f"(ftmp4), + [ftmp5]"=&f"(ftmp5), [ftmp6]"=&f"(ftmp6), + [ftmp7]"=&f"(ftmp7), 
[ftmp8]"=&f"(ftmp8), + [ftmp9]"=&f"(ftmp9), [ftmp10]"=&f"(ftmp10), + [ftmp11]"=&f"(ftmp11), [ftmp12]"=&f"(ftmp12), + [ftmp13]"=&f"(ftmp13), [tmp0]"=&r"(tmp[0]), + [addr0]"=&r"(addr[0]), [src_ptr]"+&r"(src_ptr), + [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height), + [ff_ph_40]"=&f"(ff_ph_40) + : [pixels_per_line]"r"((mips_reg)pixels_per_line), + [pixels_per_line_x2]"r"((mips_reg)(pixels_per_line<<1)), + [pixels_per_line_x4]"r"((mips_reg)(pixels_per_line<<2)), + [vp8_filter]"r"(vp8_filter), + [output_pitch]"r"((mips_reg)output_pitch) + : "memory" + ); + /* clang-format on */ +} + +/* When xoffset == 0, vp8_filter= {0,0,128,0,0,0}, + function vp8_filter_block1d_h6_mmi and vp8_filter_block1d_v6_mmi can + be simplified */ +static INLINE void vp8_filter_block1d_h6_filter0_mmi( + unsigned char *src_ptr, uint16_t *output_ptr, + unsigned int src_pixels_per_line, unsigned int output_height, + unsigned int output_width) { +#if _MIPS_SIM == _ABIO32 + register double fzero asm("$f0"); + register double ftmp0 asm("$f2"); + register double ftmp1 asm("$f4"); +#else + register double fzero asm("$f0"); + register double ftmp0 asm("$f1"); + register double ftmp1 asm("$f2"); +#endif // _MIPS_SIM == _ABIO32 + + /* clang-format off */ + __asm__ volatile ( + "pxor %[fzero], %[fzero], %[fzero] \n\t" + + "1: \n\t" + "gsldlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixels_per_line]) + + "punpcklbh %[ftmp1], %[ftmp0], %[fzero] \n\t" + "gssdlc1 %[ftmp1], 0x07(%[output_ptr]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[output_ptr]) \n\t" + + "addiu %[output_height], %[output_height], -0x01 \n\t" + MMI_ADDU(%[output_ptr], %[output_ptr], %[output_width]) + "bnez %[output_height], 1b \n\t" + : [fzero]"=&f"(fzero), [ftmp0]"=&f"(ftmp0), + [ftmp1]"=&f"(ftmp1), [src_ptr]"+&r"(src_ptr), + [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height) + : [src_pixels_per_line]"r"((mips_reg)src_pixels_per_line), + 
[output_width]"r"(output_width) + : "memory" + ); + /* clang-format on */ +} + +static INLINE void vp8_filter_block1dc_v6_filter0_mmi( + uint16_t *src_ptr, unsigned char *output_ptr, unsigned int output_height, + int output_pitch, unsigned int pixels_per_line) { +#if _MIPS_SIM == _ABIO32 + register double fzero asm("$f0"); + register double ftmp0 asm("$f2"); + register double ftmp1 asm("$f4"); +#else + register double fzero asm("$f0"); + register double ftmp0 asm("$f1"); + register double ftmp1 asm("$f2"); +#endif // _MIPS_SIM == _ABIO32 + + /* clang-format on */ + __asm__ volatile ( + "pxor %[fzero], %[fzero], %[fzero] \n\t" + + "1: \n\t" + "gsldlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[pixels_per_line]) + MMI_ADDIU(%[output_height], %[output_height], -0x01) + "packushb %[ftmp1], %[ftmp0], %[fzero] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[output_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[output_ptr]) \n\t" + + MMI_ADDU(%[output_ptr], %[output_ptr], %[output_pitch]) + "bnez %[output_height], 1b \n\t" + : [fzero]"=&f"(fzero), [ftmp0]"=&f"(ftmp0), + [ftmp1]"=&f"(ftmp1), [src_ptr]"+&r"(src_ptr), + [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height) + : [pixels_per_line]"r"((mips_reg)pixels_per_line), + [output_pitch]"r"((mips_reg)output_pitch) + : "memory" + ); + /* clang-format on */ +} + +#define sixtapNxM(n, m) \ + void vp8_sixtap_predict##n##x##m##_mmi( \ + unsigned char *src_ptr, int src_pixels_per_line, int xoffset, \ + int yoffset, unsigned char *dst_ptr, int dst_pitch) { \ + DECLARE_ALIGNED(16, uint16_t, \ + FData2[(n + 5) * (n == 16 ? 24 : (n == 8 ? 
16 : n))]); \ + const int16_t *HFilter, *VFilter; \ + int i, loop = n / 4; \ + HFilter = vp8_six_tap_mmi[xoffset]; \ + VFilter = vp8_six_tap_mmi[yoffset]; \ + \ + if (xoffset == 0) { \ + for (i = 0; i < loop; ++i) { \ + vp8_filter_block1d_h6_filter0_mmi( \ + src_ptr - (2 * src_pixels_per_line) + i * 4, FData2 + i * 4, \ + src_pixels_per_line, m + 5, n * 2); \ + } \ + } else { \ + for (i = 0; i < loop; ++i) { \ + vp8_filter_block1d_h6_mmi(src_ptr - (2 * src_pixels_per_line) + i * 4, \ + FData2 + i * 4, src_pixels_per_line, m + 5, \ + n * 2, HFilter); \ + } \ + } \ + if (yoffset == 0) { \ + for (i = 0; i < loop; ++i) { \ + vp8_filter_block1dc_v6_filter0_mmi( \ + FData2 + n * 2 + i * 4, dst_ptr + i * 4, m, dst_pitch, n * 2); \ + } \ + } else { \ + for (i = 0; i < loop; ++i) { \ + vp8_filter_block1dc_v6_mmi(FData2 + i * 4, dst_ptr + i * 4, m, \ + dst_pitch, n * 2, VFilter); \ + } \ + } \ + } + +sixtapNxM(4, 4); +sixtapNxM(8, 8); +sixtapNxM(8, 4); +sixtapNxM(16, 16); |