/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <stdio.h>

#include "config/aom_dsp_rtcd.h"

#include "aom_dsp/mips/convolve_common_dspr2.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_ports/mem.h"

#if HAVE_DSPR2
/*
 * Copy a w x h block of bytes from src to dst, one row at a time.
 *
 * src / src_stride: first byte of the source block and the byte offset from
 *                   one source row to the next.
 * dst / dst_stride: first byte of the destination block and the byte offset
 *                   from one destination row to the next.
 * filter_x, filter_x_stride, filter_y, filter_y_stride: ignored; a plain copy
 *                   applies no filtering.
 * w, h:             block width and height in bytes / rows.
 *
 * Widths 4, 8, 16, 32 and 64 use inline assembly that moves a row as 1, 2, 4,
 * 8 or 16 32-bit words; every other width falls back to a byte-by-byte C loop.
 *
 * The assembly loads with "ulw" (unaligned load word), so src may have any
 * alignment, but stores with plain "sw", which needs a word-aligned address.
 * NOTE(review): this presumes callers pass a 4-byte aligned dst and a
 * dst_stride that is a multiple of 4 for the fast-path widths -- confirm.
 */
void aom_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x, int filter_x_stride,
                             const int16_t *filter_y, int filter_y_stride,
                             int w, int h) {
  int x, y;

  (void)filter_x;
  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_y_stride;

  /* prefetch data to cache memory */
  prefetch_load(src);
  prefetch_load(src + 32);
  prefetch_store(dst);

  /*
   * Every asm statement below reads memory through src and writes memory
   * through dst while only naming registers as operands. The "memory" clobber
   * tells the compiler about those accesses so that it neither reorders nor
   * drops surrounding loads/stores to the same buffers (e.g. after inlining).
   */
  switch (w) {
    case 4: {
      uint32_t tp1;

      /* 1 word storage */
      for (y = h; y--;) {
        /* prefetch the next row while the current one is copied */
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__(
            "ulw              %[tp1],         (%[src])      \n\t"
            "sw               %[tp1],         (%[dst])      \n\t" /* store */

            : [tp1] "=&r"(tp1)
            : [src] "r"(src), [dst] "r"(dst)
            : "memory");

        src += src_stride;
        dst += dst_stride;
      }
    } break;
    case 8: {
      uint32_t tp1, tp2;

      /* 2 word storage */
      for (y = h; y--;) {
        /* prefetch the next row while the current one is copied */
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__(
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         4(%[src])      \n\t"
            "sw               %[tp1],         0(%[dst])      \n\t" /* store */
            "sw               %[tp2],         4(%[dst])      \n\t" /* store */

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2)
            : [src] "r"(src), [dst] "r"(dst)
            : "memory");

        src += src_stride;
        dst += dst_stride;
      }
    } break;
    case 16: {
      uint32_t tp1, tp2, tp3, tp4;

      /* 4 word storage */
      for (y = h; y--;) {
        /* prefetch the next row while the current one is copied */
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__(
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         4(%[src])      \n\t"
            "ulw              %[tp3],         8(%[src])      \n\t"
            "ulw              %[tp4],         12(%[src])     \n\t"

            "sw               %[tp1],         0(%[dst])      \n\t" /* store */
            "sw               %[tp2],         4(%[dst])      \n\t" /* store */
            "sw               %[tp3],         8(%[dst])      \n\t" /* store */
            "sw               %[tp4],         12(%[dst])     \n\t" /* store */

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
              [tp4] "=&r"(tp4)
            : [src] "r"(src), [dst] "r"(dst)
            : "memory");

        src += src_stride;
        dst += dst_stride;
      }
    } break;
    case 32: {
      uint32_t tp1, tp2, tp3, tp4;
      uint32_t tp5, tp6, tp7, tp8;

      /* 8 word storage */
      for (y = h; y--;) {
        /* prefetch the next row while the current one is copied */
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__(
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         4(%[src])      \n\t"
            "ulw              %[tp3],         8(%[src])      \n\t"
            "ulw              %[tp4],         12(%[src])     \n\t"
            "ulw              %[tp5],         16(%[src])     \n\t"
            "ulw              %[tp6],         20(%[src])     \n\t"
            "ulw              %[tp7],         24(%[src])     \n\t"
            "ulw              %[tp8],         28(%[src])     \n\t"

            "sw               %[tp1],         0(%[dst])      \n\t" /* store */
            "sw               %[tp2],         4(%[dst])      \n\t" /* store */
            "sw               %[tp3],         8(%[dst])      \n\t" /* store */
            "sw               %[tp4],         12(%[dst])     \n\t" /* store */
            "sw               %[tp5],         16(%[dst])     \n\t" /* store */
            "sw               %[tp6],         20(%[dst])     \n\t" /* store */
            "sw               %[tp7],         24(%[dst])     \n\t" /* store */
            "sw               %[tp8],         28(%[dst])     \n\t" /* store */

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
              [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6),
              [tp7] "=&r"(tp7), [tp8] "=&r"(tp8)
            : [src] "r"(src), [dst] "r"(dst)
            : "memory");

        src += src_stride;
        dst += dst_stride;
      }
    } break;
    case 64: {
      uint32_t tp1, tp2, tp3, tp4;
      uint32_t tp5, tp6, tp7, tp8;

      /* a 64-byte row reaches further than the prefetches issued above */
      prefetch_load(src + 64);
      prefetch_store(dst + 32);

      /* 16 word storage */
      for (y = h; y--;) {
        /* prefetch the next row while the current one is copied */
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_load(src + src_stride + 64);
        prefetch_store(dst + dst_stride);
        prefetch_store(dst + dst_stride + 32);

        /* the row is moved as two 32-byte halves reusing the same 8 temps */
        __asm__ __volatile__(
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         4(%[src])      \n\t"
            "ulw              %[tp3],         8(%[src])      \n\t"
            "ulw              %[tp4],         12(%[src])     \n\t"
            "ulw              %[tp5],         16(%[src])     \n\t"
            "ulw              %[tp6],         20(%[src])     \n\t"
            "ulw              %[tp7],         24(%[src])     \n\t"
            "ulw              %[tp8],         28(%[src])     \n\t"

            "sw               %[tp1],         0(%[dst])      \n\t" /* store */
            "sw               %[tp2],         4(%[dst])      \n\t" /* store */
            "sw               %[tp3],         8(%[dst])      \n\t" /* store */
            "sw               %[tp4],         12(%[dst])     \n\t" /* store */
            "sw               %[tp5],         16(%[dst])     \n\t" /* store */
            "sw               %[tp6],         20(%[dst])     \n\t" /* store */
            "sw               %[tp7],         24(%[dst])     \n\t" /* store */
            "sw               %[tp8],         28(%[dst])     \n\t" /* store */

            "ulw              %[tp1],         32(%[src])     \n\t"
            "ulw              %[tp2],         36(%[src])     \n\t"
            "ulw              %[tp3],         40(%[src])     \n\t"
            "ulw              %[tp4],         44(%[src])     \n\t"
            "ulw              %[tp5],         48(%[src])     \n\t"
            "ulw              %[tp6],         52(%[src])     \n\t"
            "ulw              %[tp7],         56(%[src])     \n\t"
            "ulw              %[tp8],         60(%[src])     \n\t"

            "sw               %[tp1],         32(%[dst])     \n\t" /* store */
            "sw               %[tp2],         36(%[dst])     \n\t" /* store */
            "sw               %[tp3],         40(%[dst])     \n\t" /* store */
            "sw               %[tp4],         44(%[dst])     \n\t" /* store */
            "sw               %[tp5],         48(%[dst])     \n\t" /* store */
            "sw               %[tp6],         52(%[dst])     \n\t" /* store */
            "sw               %[tp7],         56(%[dst])     \n\t" /* store */
            "sw               %[tp8],         60(%[dst])     \n\t" /* store */

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
              [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6),
              [tp7] "=&r"(tp7), [tp8] "=&r"(tp8)
            : [src] "r"(src), [dst] "r"(dst)
            : "memory");

        src += src_stride;
        dst += dst_stride;
      }
    } break;
    default:
      /* generic width: plain byte-by-byte copy of each row */
      for (y = h; y--;) {
        for (x = 0; x < w; ++x) {
          dst[x] = src[x];
        }

        src += src_stride;
        dst += dst_stride;
      }
      break;
  }
}
#endif