/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include "config/aom_dsp_rtcd.h" #include "aom_dsp/mips/convolve_common_dspr2.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" #include "aom_ports/mem.h" #if HAVE_DSPR2 void aom_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, const int16_t *filter_y, int filter_y_stride, int w, int h) { int x, y; (void)filter_x; (void)filter_x_stride; (void)filter_y; (void)filter_y_stride; /* prefetch data to cache memory */ prefetch_load(src); prefetch_load(src + 32); prefetch_store(dst); switch (w) { case 4: { uint32_t tp1; /* 1 word storage */ for (y = h; y--;) { prefetch_load(src + src_stride); prefetch_load(src + src_stride + 32); prefetch_store(dst + dst_stride); __asm__ __volatile__( "ulw %[tp1], (%[src]) \n\t" "sw %[tp1], (%[dst]) \n\t" /* store */ : [tp1] "=&r"(tp1) : [src] "r"(src), [dst] "r"(dst)); src += src_stride; dst += dst_stride; } } break; case 8: { uint32_t tp1, tp2; /* 2 word storage */ for (y = h; y--;) { prefetch_load(src + src_stride); prefetch_load(src + src_stride + 32); prefetch_store(dst + dst_stride); __asm__ __volatile__( "ulw %[tp1], 0(%[src]) \n\t" "ulw %[tp2], 4(%[src]) \n\t" "sw %[tp1], 0(%[dst]) \n\t" /* store */ "sw %[tp2], 4(%[dst]) \n\t" /* store */ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2) : [src] "r"(src), [dst] "r"(dst)); src += src_stride; dst += dst_stride; } } break; case 16: { uint32_t tp1, tp2, tp3, tp4; /* 4 word storage */ for (y = h; y--;) { prefetch_load(src + src_stride); prefetch_load(src + src_stride + 32); prefetch_store(dst + dst_stride); __asm__ __volatile__( "ulw %[tp1], 0(%[src]) \n\t" "ulw %[tp2], 4(%[src]) \n\t" "ulw %[tp3], 8(%[src]) \n\t" "ulw %[tp4], 12(%[src]) \n\t" "sw %[tp1], 0(%[dst]) \n\t" /* store */ "sw %[tp2], 4(%[dst]) \n\t" /* store */ "sw %[tp3], 8(%[dst]) \n\t" /* store */ "sw %[tp4], 12(%[dst]) \n\t" /* store */ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), [tp4] "=&r"(tp4) : [src] "r"(src), [dst] "r"(dst)); src += src_stride; dst += dst_stride; } } break; case 32: { uint32_t tp1, tp2, tp3, tp4; uint32_t tp5, tp6, tp7, tp8; /* 8 word storage */ for (y = h; y--;) { prefetch_load(src + src_stride); prefetch_load(src + src_stride + 32); prefetch_store(dst + dst_stride); __asm__ __volatile__( "ulw %[tp1], 0(%[src]) \n\t" "ulw %[tp2], 4(%[src]) \n\t" "ulw %[tp3], 8(%[src]) \n\t" "ulw %[tp4], 12(%[src]) \n\t" "ulw %[tp5], 16(%[src]) \n\t" "ulw %[tp6], 20(%[src]) \n\t" "ulw %[tp7], 24(%[src]) \n\t" "ulw %[tp8], 28(%[src]) \n\t" "sw %[tp1], 0(%[dst]) \n\t" /* store */ "sw %[tp2], 4(%[dst]) \n\t" /* store */ "sw %[tp3], 8(%[dst]) \n\t" /* store */ "sw %[tp4], 12(%[dst]) \n\t" /* store */ "sw %[tp5], 16(%[dst]) \n\t" /* store */ "sw %[tp6], 20(%[dst]) \n\t" /* store */ "sw %[tp7], 24(%[dst]) \n\t" /* store */ "sw %[tp8], 28(%[dst]) \n\t" /* store */ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6), [tp7] "=&r"(tp7), [tp8] "=&r"(tp8) : [src] "r"(src), [dst] "r"(dst)); src += src_stride; dst += dst_stride; } } break; case 64: { uint32_t tp1, tp2, tp3, tp4; uint32_t tp5, tp6, tp7, tp8; prefetch_load(src + 64); prefetch_store(dst + 32); /* 16 word storage */ for (y = h; y--;) { prefetch_load(src + src_stride); prefetch_load(src + src_stride + 32); prefetch_load(src + src_stride + 64); prefetch_store(dst + dst_stride); prefetch_store(dst + dst_stride + 32); __asm__ __volatile__( "ulw %[tp1], 0(%[src]) \n\t" "ulw %[tp2], 4(%[src]) \n\t" "ulw %[tp3], 8(%[src]) \n\t" "ulw %[tp4], 12(%[src]) \n\t" "ulw %[tp5], 16(%[src]) \n\t" "ulw %[tp6], 20(%[src]) \n\t" "ulw %[tp7], 24(%[src]) \n\t" "ulw %[tp8], 28(%[src]) \n\t" "sw %[tp1], 0(%[dst]) \n\t" /* store */ "sw %[tp2], 4(%[dst]) \n\t" /* store */ "sw %[tp3], 8(%[dst]) \n\t" /* store */ "sw %[tp4], 12(%[dst]) \n\t" /* store */ "sw %[tp5], 16(%[dst]) \n\t" /* store */ "sw %[tp6], 20(%[dst]) \n\t" /* store */ "sw %[tp7], 24(%[dst]) \n\t" /* store */ "sw %[tp8], 28(%[dst]) \n\t" /* store */ "ulw %[tp1], 32(%[src]) \n\t" "ulw %[tp2], 36(%[src]) \n\t" "ulw %[tp3], 40(%[src]) \n\t" "ulw %[tp4], 44(%[src]) \n\t" "ulw %[tp5], 48(%[src]) \n\t" "ulw %[tp6], 52(%[src]) \n\t" "ulw %[tp7], 56(%[src]) \n\t" "ulw %[tp8], 60(%[src]) \n\t" "sw %[tp1], 32(%[dst]) \n\t" /* store */ "sw %[tp2], 36(%[dst]) \n\t" /* store */ "sw %[tp3], 40(%[dst]) \n\t" /* store */ "sw %[tp4], 44(%[dst]) \n\t" /* store */ "sw %[tp5], 48(%[dst]) \n\t" /* store */ "sw %[tp6], 52(%[dst]) \n\t" /* store */ "sw %[tp7], 56(%[dst]) \n\t" /* store */ "sw %[tp8], 60(%[dst]) \n\t" /* store */ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6), [tp7] "=&r"(tp7), [tp8] "=&r"(tp8) : [src] "r"(src), [dst] "r"(dst)); src += src_stride; dst += dst_stride; } } break; default: for (y = h; y--;) { for (x = 0; x < w; ++x) { dst[x] = src[x]; } src += src_stride; dst += dst_stride; } break; } } #endif