diff options
Diffstat (limited to '')
-rw-r--r-- | media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_mmi.c | 716 |
1 files changed, 716 insertions, 0 deletions
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_mmi.c b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_mmi.c new file mode 100644 index 0000000000..cb7bca5589 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_mmi.c @@ -0,0 +1,716 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <string.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" +#include "vpx_ports/asmdefs_mmi.h" +#include "vpx_ports/mem.h" + +#define GET_DATA_H_MMI \ + "pmaddhw %[ftmp4], %[ftmp4], %[filter1] \n\t" \ + "pmaddhw %[ftmp5], %[ftmp5], %[filter2] \n\t" \ + "paddw %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \ + "punpckhwd %[ftmp5], %[ftmp4], %[ftmp0] \n\t" \ + "paddw %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \ + "pmaddhw %[ftmp6], %[ftmp6], %[filter1] \n\t" \ + "pmaddhw %[ftmp7], %[ftmp7], %[filter2] \n\t" \ + "paddw %[ftmp6], %[ftmp6], %[ftmp7] \n\t" \ + "punpckhwd %[ftmp7], %[ftmp6], %[ftmp0] \n\t" \ + "paddw %[ftmp6], %[ftmp6], %[ftmp7] \n\t" \ + "punpcklwd %[srcl], %[ftmp4], %[ftmp6] \n\t" \ + "pmaddhw %[ftmp8], %[ftmp8], %[filter1] \n\t" \ + "pmaddhw %[ftmp9], %[ftmp9], %[filter2] \n\t" \ + "paddw %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \ + "punpckhwd %[ftmp9], %[ftmp8], %[ftmp0] \n\t" \ + "paddw %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \ + "pmaddhw %[ftmp10], %[ftmp10], %[filter1] \n\t" \ + "pmaddhw %[ftmp11], %[ftmp11], %[filter2] \n\t" \ + "paddw %[ftmp10], %[ftmp10], %[ftmp11] \n\t" \ + "punpckhwd %[ftmp11], %[ftmp10], %[ftmp0] \n\t" \ + "paddw %[ftmp10], %[ftmp10], %[ftmp11] \n\t" \ + "punpcklwd %[srch], %[ftmp8], %[ftmp10] \n\t" + +#define GET_DATA_V_MMI \ + "punpcklhw %[srcl], %[ftmp4], %[ftmp5] \n\t" \ + "pmaddhw %[srcl], %[srcl], %[filter10] \n\t" \ + "punpcklhw %[ftmp12], %[ftmp6], %[ftmp7] \n\t" \ + "pmaddhw %[ftmp12], %[ftmp12], %[filter32] \n\t" \ + "paddw %[srcl], %[srcl], %[ftmp12] \n\t" \ + "punpcklhw %[ftmp12], %[ftmp8], %[ftmp9] \n\t" \ + "pmaddhw %[ftmp12], %[ftmp12], %[filter54] \n\t" \ + "paddw %[srcl], %[srcl], %[ftmp12] \n\t" \ + "punpcklhw %[ftmp12], %[ftmp10], %[ftmp11] \n\t" \ + "pmaddhw %[ftmp12], %[ftmp12], %[filter76] \n\t" \ + "paddw %[srcl], %[srcl], %[ftmp12] \n\t" \ + "punpckhhw %[srch], %[ftmp4], %[ftmp5] \n\t" \ + "pmaddhw %[srch], %[srch], %[filter10] \n\t" \ + "punpckhhw %[ftmp12], %[ftmp6], %[ftmp7] \n\t" \ + "pmaddhw %[ftmp12], %[ftmp12], %[filter32] \n\t" \ + "paddw %[srch], %[srch], %[ftmp12] \n\t" \ + "punpckhhw %[ftmp12], %[ftmp8], %[ftmp9] \n\t" \ + "pmaddhw %[ftmp12], %[ftmp12], %[filter54] \n\t" \ + "paddw %[srch], %[srch], %[ftmp12] \n\t" \ + "punpckhhw %[ftmp12], %[ftmp10], %[ftmp11] \n\t" \ + "pmaddhw %[ftmp12], %[ftmp12], %[filter76] \n\t" \ + "paddw %[srch], %[srch], %[ftmp12] \n\t" + +/* clang-format off */ +#define ROUND_POWER_OF_TWO_MMI \ + /* Add para[0] */ \ + "lw %[tmp0], 0x00(%[para]) \n\t" \ + MMI_MTC1(%[tmp0], %[ftmp6]) \ + "punpcklwd %[ftmp6], %[ftmp6], %[ftmp6] \n\t" \ + "paddw %[srcl], %[srcl], %[ftmp6] \n\t" \ + "paddw %[srch], %[srch], %[ftmp6] \n\t" \ + /* Arithmetic right shift para[1] bits */ \ + "lw %[tmp0], 0x04(%[para]) \n\t" \ + MMI_MTC1(%[tmp0], %[ftmp5]) \ + "psraw %[srcl], %[srcl], %[ftmp5] \n\t" \ + "psraw %[srch], %[srch], %[ftmp5] \n\t" +/* clang-format on */ + +#define CLIP_PIXEL_MMI \ + /* Staturated operation */ \ + "packsswh %[srcl], %[srcl], %[srch] \n\t" \ + "packushb %[ftmp12], %[srcl], %[ftmp0] \n\t" + +static void convolve_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int32_t w, int32_t h) { + const int16_t *filter_x = filter[x0_q4]; + double ftmp[14]; + uint32_t tmp[2]; + uint32_t para[5]; + para[0] = (1 << ((FILTER_BITS)-1)); + para[1] = FILTER_BITS; + src -= SUBPEL_TAPS / 2 - 1; + src_stride -= w; + dst_stride -= w; + (void)x_step_q4; + + /* clang-format off */ + __asm__ volatile( + "move %[tmp1], %[width] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "gsldlc1 %[filter1], 0x03(%[filter]) \n\t" + "gsldrc1 %[filter1], 0x00(%[filter]) \n\t" + "gsldlc1 %[filter2], 0x0b(%[filter]) \n\t" + "gsldrc1 %[filter2], 0x08(%[filter]) \n\t" + "1: \n\t" + /* Get 8 data per row */ + "gsldlc1 %[ftmp5], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp7], 0x08(%[src]) \n\t" + "gsldrc1 %[ftmp7], 0x01(%[src]) \n\t" + "gsldlc1 %[ftmp9], 0x09(%[src]) \n\t" + "gsldrc1 %[ftmp9], 0x02(%[src]) \n\t" + "gsldlc1 %[ftmp11], 0x0A(%[src]) \n\t" + "gsldrc1 %[ftmp11], 0x03(%[src]) \n\t" + "punpcklbh %[ftmp4], %[ftmp5], %[ftmp0] \n\t" + "punpckhbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "punpcklbh %[ftmp6], %[ftmp7], %[ftmp0] \n\t" + "punpckhbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" + "punpcklbh %[ftmp8], %[ftmp9], %[ftmp0] \n\t" + "punpckhbh %[ftmp9], %[ftmp9], %[ftmp0] \n\t" + "punpcklbh %[ftmp10], %[ftmp11], %[ftmp0] \n\t" + "punpckhbh %[ftmp11], %[ftmp11], %[ftmp0] \n\t" + MMI_ADDIU(%[width], %[width], -0x04) + /* Get raw data */ + GET_DATA_H_MMI + ROUND_POWER_OF_TWO_MMI + CLIP_PIXEL_MMI + "swc1 %[ftmp12], 0x00(%[dst]) \n\t" + MMI_ADDIU(%[dst], %[dst], 0x04) + MMI_ADDIU(%[src], %[src], 0x04) + /* Loop count */ + "bnez %[width], 1b \n\t" + "move %[width], %[tmp1] \n\t" + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[dst], %[dst], %[dst_stride]) + MMI_ADDIU(%[height], %[height], -0x01) + "bnez %[height], 1b \n\t" + : [srcl]"=&f"(ftmp[0]), [srch]"=&f"(ftmp[1]), + [filter1]"=&f"(ftmp[2]), [filter2]"=&f"(ftmp[3]), + [ftmp0]"=&f"(ftmp[4]), [ftmp4]"=&f"(ftmp[5]), + [ftmp5]"=&f"(ftmp[6]), [ftmp6]"=&f"(ftmp[7]), + [ftmp7]"=&f"(ftmp[8]), [ftmp8]"=&f"(ftmp[9]), + [ftmp9]"=&f"(ftmp[10]), [ftmp10]"=&f"(ftmp[11]), + [ftmp11]"=&f"(ftmp[12]), [ftmp12]"=&f"(ftmp[13]), + [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), + [src]"+&r"(src), [width]"+&r"(w), + [dst]"+&r"(dst), [height]"+&r"(h) + : [filter]"r"(filter_x), [para]"r"(para), + [src_stride]"r"((mips_reg)src_stride), + [dst_stride]"r"((mips_reg)dst_stride) + : "memory" + ); + /* clang-format on */ +} + +static void convolve_vert_mmi(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int y0_q4, + int y_step_q4, int32_t w, int32_t h) { + const int16_t *filter_y = filter[y0_q4]; + double ftmp[16]; + uint32_t tmp[1]; + uint32_t para[2]; + ptrdiff_t addr = src_stride; + para[0] = (1 << ((FILTER_BITS)-1)); + para[1] = FILTER_BITS; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + src_stride -= w; + dst_stride -= w; + (void)y_step_q4; + + __asm__ volatile( + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "gsldlc1 %[ftmp4], 0x03(%[filter]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[filter]) \n\t" + "gsldlc1 %[ftmp5], 0x0b(%[filter]) \n\t" + "gsldrc1 %[ftmp5], 0x08(%[filter]) \n\t" + "punpcklwd %[filter10], %[ftmp4], %[ftmp4] \n\t" + "punpckhwd %[filter32], %[ftmp4], %[ftmp4] \n\t" + "punpcklwd %[filter54], %[ftmp5], %[ftmp5] \n\t" + "punpckhwd %[filter76], %[ftmp5], %[ftmp5] \n\t" + "1: \n\t" + /* Get 8 data per column */ + "gsldlc1 %[ftmp4], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[src]) \n\t" + MMI_ADDU(%[tmp0], %[src], %[addr]) + "gsldlc1 %[ftmp5], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp6], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp7], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp8], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp9], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp9], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp10], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp10], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp11], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp11], 0x00(%[tmp0]) \n\t" + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" + "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t" + "punpcklbh %[ftmp9], %[ftmp9], %[ftmp0] \n\t" + "punpcklbh %[ftmp10], %[ftmp10], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp11], %[ftmp0] \n\t" + MMI_ADDIU(%[width], %[width], -0x04) + /* Get raw data */ + GET_DATA_V_MMI + ROUND_POWER_OF_TWO_MMI + CLIP_PIXEL_MMI + "swc1 %[ftmp12], 0x00(%[dst]) \n\t" + MMI_ADDIU(%[dst], %[dst], 0x04) + MMI_ADDIU(%[src], %[src], 0x04) + /* Loop count */ + "bnez %[width], 1b \n\t" + MMI_SUBU(%[width], %[addr], %[src_stride]) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[dst], %[dst], %[dst_stride]) + MMI_ADDIU(%[height], %[height], -0x01) + "bnez %[height], 1b \n\t" + : [srcl]"=&f"(ftmp[0]), [srch]"=&f"(ftmp[1]), + [filter10]"=&f"(ftmp[2]), [filter32]"=&f"(ftmp[3]), + [filter54]"=&f"(ftmp[4]), [filter76]"=&f"(ftmp[5]), + [ftmp0]"=&f"(ftmp[6]), [ftmp4]"=&f"(ftmp[7]), + [ftmp5]"=&f"(ftmp[8]), [ftmp6]"=&f"(ftmp[9]), + [ftmp7]"=&f"(ftmp[10]), [ftmp8]"=&f"(ftmp[11]), + [ftmp9]"=&f"(ftmp[12]), [ftmp10]"=&f"(ftmp[13]), + [ftmp11]"=&f"(ftmp[14]), [ftmp12]"=&f"(ftmp[15]), + [src]"+&r"(src), [dst]"+&r"(dst), + [width]"+&r"(w), [height]"+&r"(h), + [tmp0]"=&r"(tmp[0]) + : [filter]"r"(filter_y), [para]"r"(para), + [src_stride]"r"((mips_reg)src_stride), + [dst_stride]"r"((mips_reg)dst_stride), + [addr]"r"((mips_reg)addr) + : "memory" + ); +} + +static void convolve_avg_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int32_t w, int32_t h) { + const int16_t *filter_x = filter[x0_q4]; + double ftmp[14]; + uint32_t tmp[2]; + uint32_t para[2]; + para[0] = (1 << ((FILTER_BITS)-1)); + para[1] = FILTER_BITS; + src -= SUBPEL_TAPS / 2 - 1; + src_stride -= w; + dst_stride -= w; + (void)x_step_q4; + + __asm__ volatile( + "move %[tmp1], %[width] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "gsldlc1 %[filter1], 0x03(%[filter]) \n\t" + "gsldrc1 %[filter1], 0x00(%[filter]) \n\t" + "gsldlc1 %[filter2], 0x0b(%[filter]) \n\t" + "gsldrc1 %[filter2], 0x08(%[filter]) \n\t" + "1: \n\t" + /* Get 8 data per row */ + "gsldlc1 %[ftmp5], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp7], 0x08(%[src]) \n\t" + "gsldrc1 %[ftmp7], 0x01(%[src]) \n\t" + "gsldlc1 %[ftmp9], 0x09(%[src]) \n\t" + "gsldrc1 %[ftmp9], 0x02(%[src]) \n\t" + "gsldlc1 %[ftmp11], 0x0A(%[src]) \n\t" + "gsldrc1 %[ftmp11], 0x03(%[src]) \n\t" + "punpcklbh %[ftmp4], %[ftmp5], %[ftmp0] \n\t" + "punpckhbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "punpcklbh %[ftmp6], %[ftmp7], %[ftmp0] \n\t" + "punpckhbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" + "punpcklbh %[ftmp8], %[ftmp9], %[ftmp0] \n\t" + "punpckhbh %[ftmp9], %[ftmp9], %[ftmp0] \n\t" + "punpcklbh %[ftmp10], %[ftmp11], %[ftmp0] \n\t" + "punpckhbh %[ftmp11], %[ftmp11], %[ftmp0] \n\t" + MMI_ADDIU(%[width], %[width], -0x04) + /* Get raw data */ + GET_DATA_H_MMI + ROUND_POWER_OF_TWO_MMI + CLIP_PIXEL_MMI + "punpcklbh %[ftmp12], %[ftmp12], %[ftmp0] \n\t" + "gsldlc1 %[ftmp4], 0x07(%[dst]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[dst]) \n\t" + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "paddh %[ftmp12], %[ftmp12], %[ftmp4] \n\t" + "li %[tmp0], 0x10001 \n\t" + MMI_MTC1(%[tmp0], %[ftmp5]) + "punpcklhw %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "paddh %[ftmp12], %[ftmp12], %[ftmp5] \n\t" + "psrah %[ftmp12], %[ftmp12], %[ftmp5] \n\t" + "packushb %[ftmp12], %[ftmp12], %[ftmp0] \n\t" + "swc1 %[ftmp12], 0x00(%[dst]) \n\t" + MMI_ADDIU(%[dst], %[dst], 0x04) + MMI_ADDIU(%[src], %[src], 0x04) + /* Loop count */ + "bnez %[width], 1b \n\t" + "move %[width], %[tmp1] \n\t" + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[dst], %[dst], %[dst_stride]) + MMI_ADDIU(%[height], %[height], -0x01) + "bnez %[height], 1b \n\t" + : [srcl]"=&f"(ftmp[0]), [srch]"=&f"(ftmp[1]), + [filter1]"=&f"(ftmp[2]), [filter2]"=&f"(ftmp[3]), + [ftmp0]"=&f"(ftmp[4]), [ftmp4]"=&f"(ftmp[5]), + [ftmp5]"=&f"(ftmp[6]), [ftmp6]"=&f"(ftmp[7]), + [ftmp7]"=&f"(ftmp[8]), [ftmp8]"=&f"(ftmp[9]), + [ftmp9]"=&f"(ftmp[10]), [ftmp10]"=&f"(ftmp[11]), + [ftmp11]"=&f"(ftmp[12]), [ftmp12]"=&f"(ftmp[13]), + [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), + [src]"+&r"(src), [width]"+&r"(w), + [dst]"+&r"(dst), [height]"+&r"(h) + : [filter]"r"(filter_x), [para]"r"(para), + [src_stride]"r"((mips_reg)src_stride), + [dst_stride]"r"((mips_reg)dst_stride) + : "memory" + ); +} + +static void convolve_avg_vert_mmi(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int y0_q4, + int y_step_q4, int32_t w, int32_t h) { + const int16_t *filter_y = filter[y0_q4]; + double ftmp[16]; + uint32_t tmp[1]; + uint32_t para[2]; + ptrdiff_t addr = src_stride; + para[0] = (1 << ((FILTER_BITS)-1)); + para[1] = FILTER_BITS; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + src_stride -= w; + dst_stride -= w; + (void)y_step_q4; + + __asm__ volatile( + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "gsldlc1 %[ftmp4], 0x03(%[filter]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[filter]) \n\t" + "gsldlc1 %[ftmp5], 0x0b(%[filter]) \n\t" + "gsldrc1 %[ftmp5], 0x08(%[filter]) \n\t" + "punpcklwd %[filter10], %[ftmp4], %[ftmp4] \n\t" + "punpckhwd %[filter32], %[ftmp4], %[ftmp4] \n\t" + "punpcklwd %[filter54], %[ftmp5], %[ftmp5] \n\t" + "punpckhwd %[filter76], %[ftmp5], %[ftmp5] \n\t" + "1: \n\t" + /* Get 8 data per column */ + "gsldlc1 %[ftmp4], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[src]) \n\t" + MMI_ADDU(%[tmp0], %[src], %[addr]) + "gsldlc1 %[ftmp5], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp6], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp7], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp8], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp9], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp9], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp10], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp10], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp11], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp11], 0x00(%[tmp0]) \n\t" + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" + "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t" + "punpcklbh %[ftmp9], %[ftmp9], %[ftmp0] \n\t" + "punpcklbh %[ftmp10], %[ftmp10], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp11], %[ftmp0] \n\t" + MMI_ADDIU(%[width], %[width], -0x04) + /* Get raw data */ + GET_DATA_V_MMI + ROUND_POWER_OF_TWO_MMI + CLIP_PIXEL_MMI + "punpcklbh %[ftmp12], %[ftmp12], %[ftmp0] \n\t" + "gsldlc1 %[ftmp4], 0x07(%[dst]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[dst]) \n\t" + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "paddh %[ftmp12], %[ftmp12], %[ftmp4] \n\t" + "li %[tmp0], 0x10001 \n\t" + MMI_MTC1(%[tmp0], %[ftmp5]) + "punpcklhw %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "paddh %[ftmp12], %[ftmp12], %[ftmp5] \n\t" + "psrah %[ftmp12], %[ftmp12], %[ftmp5] \n\t" + "packushb %[ftmp12], %[ftmp12], %[ftmp0] \n\t" + "swc1 %[ftmp12], 0x00(%[dst]) \n\t" + MMI_ADDIU(%[dst], %[dst], 0x04) + MMI_ADDIU(%[src], %[src], 0x04) + /* Loop count */ + "bnez %[width], 1b \n\t" + MMI_SUBU(%[width], %[addr], %[src_stride]) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[dst], %[dst], %[dst_stride]) + MMI_ADDIU(%[height], %[height], -0x01) + "bnez %[height], 1b \n\t" + : [srcl]"=&f"(ftmp[0]), [srch]"=&f"(ftmp[1]), + [filter10]"=&f"(ftmp[2]), [filter32]"=&f"(ftmp[3]), + [filter54]"=&f"(ftmp[4]), [filter76]"=&f"(ftmp[5]), + [ftmp0]"=&f"(ftmp[6]), [ftmp4]"=&f"(ftmp[7]), + [ftmp5]"=&f"(ftmp[8]), [ftmp6]"=&f"(ftmp[9]), + [ftmp7]"=&f"(ftmp[10]), [ftmp8]"=&f"(ftmp[11]), + [ftmp9]"=&f"(ftmp[12]), [ftmp10]"=&f"(ftmp[13]), + [ftmp11]"=&f"(ftmp[14]), [ftmp12]"=&f"(ftmp[15]), + [src]"+&r"(src), [dst]"+&r"(dst), + [width]"+&r"(w), [height]"+&r"(h), + [tmp0]"=&r"(tmp[0]) + : [filter]"r"(filter_y), [para]"r"(para), + [src_stride]"r"((mips_reg)src_stride), + [dst_stride]"r"((mips_reg)dst_stride), + [addr]"r"((mips_reg)addr) + : "memory" + ); +} + +void vpx_convolve_avg_mmi(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int y_step_q4, int w, int h) { + int x, y; + + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + if (w & 0x03) { + for (y = 0; y < h; ++y) { + for (x = 0; x < w; ++x) dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1); + src += src_stride; + dst += dst_stride; + } + } else { + double ftmp[4]; + uint32_t tmp[2]; + src_stride -= w; + dst_stride -= w; + + __asm__ volatile( + "move %[tmp1], %[width] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "li %[tmp0], 0x10001 \n\t" + MMI_MTC1(%[tmp0], %[ftmp3]) + "punpcklhw %[ftmp3], %[ftmp3], %[ftmp3] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[dst]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[dst]) \n\t" + "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "psrah %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "swc1 %[ftmp1], 0x00(%[dst]) \n\t" + MMI_ADDIU(%[width], %[width], -0x04) + MMI_ADDIU(%[dst], %[dst], 0x04) + MMI_ADDIU(%[src], %[src], 0x04) + "bnez %[width], 1b \n\t" + "move %[width], %[tmp1] \n\t" + MMI_ADDU(%[dst], %[dst], %[dst_stride]) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDIU(%[height], %[height], -0x01) + "bnez %[height], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), + [src]"+&r"(src), [dst]"+&r"(dst), + [width]"+&r"(w), [height]"+&r"(h) + : [src_stride]"r"((mips_reg)src_stride), + [dst_stride]"r"((mips_reg)dst_stride) + : "memory" + ); + } +} + +static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h) { + int x, y; + src -= SUBPEL_TAPS / 2 - 1; + + for (y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (x = 0; x < w; ++x) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; + dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h) { + int x, y; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + + for (x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (y = 0; y < h; ++y) { + const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) + sum += src_y[k * src_stride] * y_filter[k]; + dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h) { + int x, y; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + + for (x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (y = 0; y < h; ++y) { + const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) + sum += src_y[k * src_stride] * y_filter[k]; + dst[y * dst_stride] = ROUND_POWER_OF_TWO( + dst[y * dst_stride] + + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), + 1); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h) { + int x, y; + src -= SUBPEL_TAPS / 2 - 1; + + for (y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (x = 0; x < w; ++x) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; + dst[x] = ROUND_POWER_OF_TWO( + dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +void vpx_convolve8_mmi(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int32_t x_step_q4, int y0_q4, + int32_t y_step_q4, int32_t w, int32_t h) { + // Note: Fixed size intermediate buffer, temp, places limits on parameters. + // 2d filtering proceeds in 2 steps: + // (1) Interpolate horizontally into an intermediate buffer, temp. + // (2) Interpolate temp vertically to derive the sub-pixel result. + // Deriving the maximum number of rows in the temp buffer (135): + // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). + // --Largest block size is 64x64 pixels. + // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the + // original frame (in 1/16th pixel units). + // --Must round-up because block may be located at sub-pixel position. + // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. + // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. + // When calling in frame scaling function, the smallest scaling factor is x1/4 + // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still + // big enough. + uint8_t temp[64 * 135]; + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + assert(w <= 64); + assert(h <= 64); + assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32)); + assert(x_step_q4 <= 64); + + if (w & 0x03) { + convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, + 64, filter, x0_q4, x_step_q4, w, intermediate_height); + convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, + filter, y0_q4, y_step_q4, w, h); + } else { + convolve_horiz_mmi(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, + temp, 64, filter, x0_q4, x_step_q4, w, + intermediate_height); + convolve_vert_mmi(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, + filter, y0_q4, y_step_q4, w, h); + } +} + +void vpx_convolve8_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int32_t y_step_q4, + int32_t w, int32_t h) { + (void)y0_q4; + (void)y_step_q4; + if (w & 0x03) + convolve_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, + w, h); + else + convolve_horiz_mmi(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, w, h); +} + +void vpx_convolve8_vert_mmi(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + (void)x0_q4; + (void)x_step_q4; + if (w & 0x03) + convolve_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, w, + h); + else + convolve_vert_mmi(src, src_stride, dst, dst_stride, filter, y0_q4, + y_step_q4, w, h); +} + +void vpx_convolve8_avg_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + (void)y0_q4; + (void)y_step_q4; + if (w & 0x03) + convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, w, h); + else + convolve_avg_horiz_mmi(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, w, h); +} + +void vpx_convolve8_avg_vert_mmi(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + (void)x0_q4; + (void)x_step_q4; + if (w & 0x03) + convolve_avg_vert(src, src_stride, dst, dst_stride, filter, y0_q4, + y_step_q4, w, h); + else + convolve_avg_vert_mmi(src, src_stride, dst, dst_stride, filter, y0_q4, + y_step_q4, w, h); +} + +void vpx_convolve8_avg_mmi(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int32_t y_step_q4, + int32_t w, int32_t h) { + // Fixed size intermediate buffer places limits on parameters. + DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]); + assert(w <= 64); + assert(h <= 64); + + vpx_convolve8_mmi(src, src_stride, temp, 64, filter, x0_q4, x_step_q4, y0_q4, + y_step_q4, w, h); + vpx_convolve_avg_mmi(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h); +} |