summary refs log tree commit diff stats
path: root/media/libvpx/libvpx/vp8/common
diff options
context:
space:
mode:
Diffstat (limited to 'media/libvpx/libvpx/vp8/common')
-rw-r--r--media/libvpx/libvpx/vp8/common/alloccommon.c187
-rw-r--r--media/libvpx/libvpx/vp8/common/alloccommon.h30
-rw-r--r--media/libvpx/libvpx/vp8/common/arm/loopfilter_arm.c85
-rw-r--r--media/libvpx/libvpx/vp8/common/arm/loopfilter_arm.h31
-rw-r--r--media/libvpx/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c764
-rw-r--r--media/libvpx/libvpx/vp8/common/arm/neon/copymem_neon.c52
-rw-r--r--media/libvpx/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c41
-rw-r--r--media/libvpx/libvpx/vp8/common/arm/neon/dequant_idct_neon.c141
-rw-r--r--media/libvpx/libvpx/vp8/common/arm/neon/dequantizeb_neon.c26
-rw-r--r--media/libvpx/libvpx/vp8/common/arm/neon/idct_blk_neon.c295
-rw-r--r--media/libvpx/libvpx/vp8/common/arm/neon/iwalsh_neon.c102
-rw-r--r--media/libvpx/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c106
-rw-r--r--media/libvpx/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c274
-rw-r--r--media/libvpx/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c613
-rw-r--r--media/libvpx/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c121
-rw-r--r--media/libvpx/libvpx/vp8/common/arm/neon/sixtappredict_neon.c1729
-rw-r--r--media/libvpx/libvpx/vp8/common/arm/neon/vp8_loopfilter_neon.c538
-rw-r--r--media/libvpx/libvpx/vp8/common/blockd.c19
-rw-r--r--media/libvpx/libvpx/vp8/common/blockd.h311
-rw-r--r--media/libvpx/libvpx/vp8/common/coefupdateprobs.h197
-rw-r--r--media/libvpx/libvpx/vp8/common/common.h48
-rw-r--r--media/libvpx/libvpx/vp8/common/context.c398
-rw-r--r--media/libvpx/libvpx/vp8/common/debugmodes.c135
-rw-r--r--media/libvpx/libvpx/vp8/common/default_coef_probs.h160
-rw-r--r--media/libvpx/libvpx/vp8/common/dequantize.c37
-rw-r--r--media/libvpx/libvpx/vp8/common/entropy.c147
-rw-r--r--media/libvpx/libvpx/vp8/common/entropy.h108
-rw-r--r--media/libvpx/libvpx/vp8/common/entropymode.c104
-rw-r--r--media/libvpx/libvpx/vp8/common/entropymode.h88
-rw-r--r--media/libvpx/libvpx/vp8/common/entropymv.c47
-rw-r--r--media/libvpx/libvpx/vp8/common/entropymv.h49
-rw-r--r--media/libvpx/libvpx/vp8/common/extend.c167
-rw-r--r--media/libvpx/libvpx/vp8/common/extend.h32
-rw-r--r--media/libvpx/libvpx/vp8/common/filter.c381
-rw-r--r--media/libvpx/libvpx/vp8/common/filter.h31
-rw-r--r--media/libvpx/libvpx/vp8/common/findnearmv.c159
-rw-r--r--media/libvpx/libvpx/vp8/common/findnearmv.h151
-rw-r--r--media/libvpx/libvpx/vp8/common/generic/systemdependent.c111
-rw-r--r--media/libvpx/libvpx/vp8/common/header.h48
-rw-r--r--media/libvpx/libvpx/vp8/common/idct_blk.c72
-rw-r--r--media/libvpx/libvpx/vp8/common/idctllm.c185
-rw-r--r--media/libvpx/libvpx/vp8/common/invtrans.h57
-rw-r--r--media/libvpx/libvpx/vp8/common/loongarch/idct_lsx.c322
-rw-r--r--media/libvpx/libvpx/vp8/common/loongarch/loopfilter_filters_lsx.c743
-rw-r--r--media/libvpx/libvpx/vp8/common/loongarch/sixtap_filter_lsx.c1903
-rw-r--r--media/libvpx/libvpx/vp8/common/loopfilter.h101
-rw-r--r--media/libvpx/libvpx/vp8/common/loopfilter_filters.c397
-rw-r--r--media/libvpx/libvpx/vp8/common/mbpitch.c57
-rw-r--r--media/libvpx/libvpx/vp8/common/mfqe.c327
-rw-r--r--media/libvpx/libvpx/vp8/common/mips/dspr2/dequantize_dspr2.c29
-rw-r--r--media/libvpx/libvpx/vp8/common/mips/dspr2/filter_dspr2.c2767
-rw-r--r--media/libvpx/libvpx/vp8/common/mips/dspr2/idct_blk_dspr2.c76
-rw-r--r--media/libvpx/libvpx/vp8/common/mips/dspr2/idctllm_dspr2.c346
-rw-r--r--media/libvpx/libvpx/vp8/common/mips/dspr2/reconinter_dspr2.c97
-rw-r--r--media/libvpx/libvpx/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c2401
-rw-r--r--media/libvpx/libvpx/vp8/common/mips/mmi/copymem_mmi.c114
-rw-r--r--media/libvpx/libvpx/vp8/common/mips/mmi/dequantize_mmi.c115
-rw-r--r--media/libvpx/libvpx/vp8/common/mips/mmi/idct_blk_mmi.c70
-rw-r--r--media/libvpx/libvpx/vp8/common/mips/mmi/idctllm_mmi.c335
-rw-r--r--media/libvpx/libvpx/vp8/common/mips/mmi/loopfilter_filters_mmi.c1415
-rw-r--r--media/libvpx/libvpx/vp8/common/mips/mmi/sixtap_filter_mmi.c427
-rw-r--r--media/libvpx/libvpx/vp8/common/mips/msa/bilinear_filter_msa.c797
-rw-r--r--media/libvpx/libvpx/vp8/common/mips/msa/copymem_msa.c62
-rw-r--r--media/libvpx/libvpx/vp8/common/mips/msa/idct_msa.c406
-rw-r--r--media/libvpx/libvpx/vp8/common/mips/msa/loopfilter_filters_msa.c709
-rw-r--r--media/libvpx/libvpx/vp8/common/mips/msa/mfqe_msa.c139
-rw-r--r--media/libvpx/libvpx/vp8/common/mips/msa/sixtap_filter_msa.c1738
-rw-r--r--media/libvpx/libvpx/vp8/common/mips/msa/vp8_macros_msa.h1762
-rw-r--r--media/libvpx/libvpx/vp8/common/modecont.c26
-rw-r--r--media/libvpx/libvpx/vp8/common/modecont.h24
-rw-r--r--media/libvpx/libvpx/vp8/common/mv.h33
-rw-r--r--media/libvpx/libvpx/vp8/common/onyx.h277
-rw-r--r--media/libvpx/libvpx/vp8/common/onyxc_int.h177
-rw-r--r--media/libvpx/libvpx/vp8/common/onyxd.h62
-rw-r--r--media/libvpx/libvpx/vp8/common/postproc.c264
-rw-r--r--media/libvpx/libvpx/vp8/common/postproc.h45
-rw-r--r--media/libvpx/libvpx/vp8/common/ppflags.h39
-rw-r--r--media/libvpx/libvpx/vp8/common/quant_common.c130
-rw-r--r--media/libvpx/libvpx/vp8/common/quant_common.h33
-rw-r--r--media/libvpx/libvpx/vp8/common/reconinter.c503
-rw-r--r--media/libvpx/libvpx/vp8/common/reconinter.h36
-rw-r--r--media/libvpx/libvpx/vp8/common/reconintra.c104
-rw-r--r--media/libvpx/libvpx/vp8/common/reconintra.h35
-rw-r--r--media/libvpx/libvpx/vp8/common/reconintra4x4.c75
-rw-r--r--media/libvpx/libvpx/vp8/common/reconintra4x4.h45
-rw-r--r--media/libvpx/libvpx/vp8/common/rtcd.c15
-rw-r--r--media/libvpx/libvpx/vp8/common/rtcd_defs.pl250
-rw-r--r--media/libvpx/libvpx/vp8/common/setupintrarecon.c38
-rw-r--r--media/libvpx/libvpx/vp8/common/setupintrarecon.h40
-rw-r--r--media/libvpx/libvpx/vp8/common/swapyv12buffer.c32
-rw-r--r--media/libvpx/libvpx/vp8/common/swapyv12buffer.h27
-rw-r--r--media/libvpx/libvpx/vp8/common/systemdependent.h27
-rw-r--r--media/libvpx/libvpx/vp8/common/threading.h215
-rw-r--r--media/libvpx/libvpx/vp8/common/treecoder.c102
-rw-r--r--media/libvpx/libvpx/vp8/common/treecoder.h82
-rw-r--r--media/libvpx/libvpx/vp8/common/vp8_entropymodedata.h172
-rw-r--r--media/libvpx/libvpx/vp8/common/vp8_loopfilter.c566
-rw-r--r--media/libvpx/libvpx/vp8/common/vp8_skin_detection.c109
-rw-r--r--media/libvpx/libvpx/vp8/common/vp8_skin_detection.h47
-rw-r--r--media/libvpx/libvpx/vp8/common/x86/bilinear_filter_sse2.c336
-rw-r--r--media/libvpx/libvpx/vp8/common/x86/dequantize_mmx.asm259
-rw-r--r--media/libvpx/libvpx/vp8/common/x86/idct_blk_mmx.c23
-rw-r--r--media/libvpx/libvpx/vp8/common/x86/idct_blk_sse2.c84
-rw-r--r--media/libvpx/libvpx/vp8/common/x86/idctllm_mmx.asm296
-rw-r--r--media/libvpx/libvpx/vp8/common/x86/idctllm_sse2.asm710
-rw-r--r--media/libvpx/libvpx/vp8/common/x86/iwalsh_sse2.asm123
-rw-r--r--media/libvpx/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm817
-rw-r--r--media/libvpx/libvpx/vp8/common/x86/loopfilter_sse2.asm1642
-rw-r--r--media/libvpx/libvpx/vp8/common/x86/loopfilter_x86.c129
-rw-r--r--media/libvpx/libvpx/vp8/common/x86/mfqe_sse2.asm289
-rw-r--r--media/libvpx/libvpx/vp8/common/x86/recon_mmx.asm120
-rw-r--r--media/libvpx/libvpx/vp8/common/x86/recon_sse2.asm118
-rw-r--r--media/libvpx/libvpx/vp8/common/x86/subpixel_mmx.asm270
-rw-r--r--media/libvpx/libvpx/vp8/common/x86/subpixel_sse2.asm963
-rw-r--r--media/libvpx/libvpx/vp8/common/x86/subpixel_ssse3.asm1515
-rw-r--r--media/libvpx/libvpx/vp8/common/x86/vp8_asm_stubs.c365
116 files changed, 37721 insertions, 0 deletions
diff --git a/media/libvpx/libvpx/vp8/common/alloccommon.c b/media/libvpx/libvpx/vp8/common/alloccommon.c
new file mode 100644
index 0000000000..722b158c3a
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/alloccommon.c
@@ -0,0 +1,187 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "alloccommon.h"
+#include "blockd.h"
+#include "vpx_mem/vpx_mem.h"
+#include "onyxc_int.h"
+#include "findnearmv.h"
+#include "entropymode.h"
+#include "systemdependent.h"
+/* Releases the frame buffers and side tables held by *oci and NULLs every pointer it frees with vpx_free(). */
+void vp8_de_alloc_frame_buffers(VP8_COMMON *oci) {
+ int i;
+ for (i = 0; i < NUM_YV12_BUFFERS; ++i) {
+ vp8_yv12_de_alloc_frame_buffer(&oci->yv12_fb[i]);
+ }
+
+ vp8_yv12_de_alloc_frame_buffer(&oci->temp_scale_frame);
+#if CONFIG_POSTPROC
+ vp8_yv12_de_alloc_frame_buffer(&oci->post_proc_buffer);
+ if (oci->post_proc_buffer_int_used) { /* released only when flagged as in use */
+ vp8_yv12_de_alloc_frame_buffer(&oci->post_proc_buffer_int);
+ }
+
+ vpx_free(oci->pp_limits_buffer);
+ oci->pp_limits_buffer = NULL;
+
+ vpx_free(oci->postproc_state.generated_noise);
+ oci->postproc_state.generated_noise = NULL;
+#endif
+
+ vpx_free(oci->above_context);
+ vpx_free(oci->mip); /* NOTE(review): oci->mi, which vp8_alloc_frame_buffers() points into mip, is not reset here */
+#if CONFIG_ERROR_CONCEALMENT
+ vpx_free(oci->prev_mip);
+ oci->prev_mip = NULL;
+#endif
+
+ oci->above_context = NULL;
+ oci->mip = NULL;
+}
+/* (Re)allocates the per-frame storage of *oci for a width x height frame; returns 0 on success, 1 on failure. */
+int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height) {
+ int i;
+
+ vp8_de_alloc_frame_buffers(oci); /* release whatever was allocated before */
+
+ /* our internal buffers are always multiples of 16: round both dimensions up */
+ if ((width & 0xf) != 0) width += 16 - (width & 0xf);
+
+ if ((height & 0xf) != 0) height += 16 - (height & 0xf);
+
+ for (i = 0; i < NUM_YV12_BUFFERS; ++i) {
+ oci->fb_idx_ref_cnt[i] = 0;
+ oci->yv12_fb[i].flags = 0;
+ if (vp8_yv12_alloc_frame_buffer(&oci->yv12_fb[i], width, height,
+ VP8BORDERINPIXELS) < 0) {
+ goto allocation_fail;
+ }
+ }
+
+ oci->new_fb_idx = 0; /* buffers 0..3 are assigned to the new/lst/gld/alt indices ... */
+ oci->lst_fb_idx = 1;
+ oci->gld_fb_idx = 2;
+ oci->alt_fb_idx = 3;
+
+ oci->fb_idx_ref_cnt[0] = 1; /* ... and each of those four starts with a reference count of 1 */
+ oci->fb_idx_ref_cnt[1] = 1;
+ oci->fb_idx_ref_cnt[2] = 1;
+ oci->fb_idx_ref_cnt[3] = 1;
+
+ if (vp8_yv12_alloc_frame_buffer(&oci->temp_scale_frame, width, 16, /* this frame is only 16 rows tall */
+ VP8BORDERINPIXELS) < 0) {
+ goto allocation_fail;
+ }
+
+ oci->mb_rows = height >> 4; /* both dimensions are multiples of 16 here, so the shifts are exact */
+ oci->mb_cols = width >> 4;
+ oci->MBs = oci->mb_rows * oci->mb_cols;
+ oci->mode_info_stride = oci->mb_cols + 1; /* one extra entry per row */
+ oci->mip =
+ vpx_calloc((oci->mb_cols + 1) * (oci->mb_rows + 1), sizeof(MODE_INFO)); /* one extra row and column; presumably zero-filled like calloc (confirm) */
+
+ if (!oci->mip) goto allocation_fail;
+
+ oci->mi = oci->mip + oci->mode_info_stride + 1; /* mi starts one full row plus one entry into mip */
+
+ /* Allocation of previous mode info will be done in vp8_decode_frame()
+ * as it is a decoder only data */
+
+ oci->above_context =
+ vpx_calloc(sizeof(ENTROPY_CONTEXT_PLANES) * oci->mb_cols, 1); /* one ENTROPY_CONTEXT_PLANES per macroblock column */
+
+ if (!oci->above_context) goto allocation_fail;
+
+#if CONFIG_POSTPROC
+ if (vp8_yv12_alloc_frame_buffer(&oci->post_proc_buffer, width, height,
+ VP8BORDERINPIXELS) < 0) {
+ goto allocation_fail;
+ }
+
+ oci->post_proc_buffer_int_used = 0;
+ memset(&oci->postproc_state, 0, sizeof(oci->postproc_state));
+ memset(oci->post_proc_buffer.buffer_alloc, 128,
+ oci->post_proc_buffer.frame_size); /* frame_size bytes starting at buffer_alloc are set to 128 */
+
+ /* Allocate buffer to store post-processing filter coefficients.
+ *
+ * Note: Round up mb_cols to support SIMD reads
+ */
+ oci->pp_limits_buffer = vpx_memalign(16, 24 * ((oci->mb_cols + 1) & ~1)); /* 24 bytes per column, mb_cols rounded up to even */
+ if (!oci->pp_limits_buffer) goto allocation_fail;
+#endif
+
+ return 0;
+
+allocation_fail: /* shared failure exit: free anything allocated so far */
+ vp8_de_alloc_frame_buffers(oci);
+ return 1;
+}
+/* Sets no_lpf, filter_type, use_bilinear_mc_filter and full_pixel of *cm from cm->version; values outside 0..3 get the version-0 settings. */
+void vp8_setup_version(VP8_COMMON *cm) {
+ switch (cm->version) {
+ case 0: /* normal loop filter, bilinear MC off */
+ cm->no_lpf = 0;
+ cm->filter_type = NORMAL_LOOPFILTER;
+ cm->use_bilinear_mc_filter = 0;
+ cm->full_pixel = 0;
+ break;
+ case 1: /* simple loop filter, bilinear MC on */
+ cm->no_lpf = 0;
+ cm->filter_type = SIMPLE_LOOPFILTER;
+ cm->use_bilinear_mc_filter = 1;
+ cm->full_pixel = 0;
+ break;
+ case 2: /* no_lpf set, bilinear MC on */
+ cm->no_lpf = 1;
+ cm->filter_type = NORMAL_LOOPFILTER;
+ cm->use_bilinear_mc_filter = 1;
+ cm->full_pixel = 0;
+ break;
+ case 3: /* no_lpf set, bilinear MC on; the only version that sets full_pixel */
+ cm->no_lpf = 1;
+ cm->filter_type = SIMPLE_LOOPFILTER;
+ cm->use_bilinear_mc_filter = 1;
+ cm->full_pixel = 1;
+ break;
+ default:
+ /* 4, 5, 6, 7 are reserved for future use; they get the same values as version 0 */
+ cm->no_lpf = 0;
+ cm->filter_type = NORMAL_LOOPFILTER;
+ cm->use_bilinear_mc_filter = 0;
+ cm->full_pixel = 0;
+ break;
+ }
+}
+void vp8_create_common(VP8_COMMON *oci) { /* Initializes *oci to its default settings. */
+ vp8_machine_specific_config(oci); /* NOTE(review): defined outside this file; presumably platform-specific setup, confirm */
+
+ vp8_init_mbmode_probs(oci);
+ vp8_default_bmode_probs(oci->fc.bmode_prob);
+
+ oci->mb_no_coeff_skip = 1;
+ oci->no_lpf = 0; /* this and the next three match what vp8_setup_version() sets for version 0 */
+ oci->filter_type = NORMAL_LOOPFILTER;
+ oci->use_bilinear_mc_filter = 0;
+ oci->full_pixel = 0;
+ oci->multi_token_partition = ONE_PARTITION;
+ oci->clamp_type = RECON_CLAMP_REQUIRED;
+
+ /* Initialize reference frame sign bias structure to defaults */
+ memset(oci->ref_frame_sign_bias, 0, sizeof(oci->ref_frame_sign_bias));
+
+ /* Default disable buffer to buffer copying */
+ oci->copy_buffer_to_gf = 0;
+ oci->copy_buffer_to_arf = 0;
+}
+/* Thin wrapper: tears down *oci by calling vp8_de_alloc_frame_buffers(). */
+void vp8_remove_common(VP8_COMMON *oci) { vp8_de_alloc_frame_buffers(oci); }
diff --git a/media/libvpx/libvpx/vp8/common/alloccommon.h b/media/libvpx/libvpx/vp8/common/alloccommon.h
new file mode 100644
index 0000000000..2d376bbac3
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/alloccommon.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP8_COMMON_ALLOCCOMMON_H_
+#define VPX_VP8_COMMON_ALLOCCOMMON_H_
+
+#include "onyxc_int.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/* Setup, teardown and buffer-allocation entry points for VP8_COMMON, defined in alloccommon.c. */
+void vp8_create_common(VP8_COMMON *oci); /* fills *oci with default settings */
+void vp8_remove_common(VP8_COMMON *oci); /* calls vp8_de_alloc_frame_buffers() */
+void vp8_de_alloc_frame_buffers(VP8_COMMON *oci); /* frees the buffers held by *oci */
+int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height); /* returns 0 on success, 1 on failure */
+void vp8_setup_version(VP8_COMMON *cm); /* derives filter/prediction flags from cm->version */
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP8_COMMON_ALLOCCOMMON_H_
diff --git a/media/libvpx/libvpx/vp8/common/arm/loopfilter_arm.c b/media/libvpx/libvpx/vp8/common/arm/loopfilter_arm.c
new file mode 100644
index 0000000000..48a1972048
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/arm/loopfilter_arm.c
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vp8_rtcd.h"
+#include "vp8/common/arm/loopfilter_arm.h"
+#include "vp8/common/loopfilter.h"
+#include "vp8/common/onyxc_int.h"
+
+/* NEON loopfilter functions */
+/* Horizontal MB filtering: runs the mbloop horizontal-edge filter at y_ptr and, when u_ptr is non-NULL, at u_ptr/v_ptr */
+void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr,
+ unsigned char *v_ptr, int y_stride, int uv_stride,
+ loop_filter_info *lfi) {
+ unsigned char mblim = *lfi->mblim; /* lfi holds pointers; only the first byte behind each is read */
+ unsigned char lim = *lfi->lim;
+ unsigned char hev_thr = *lfi->hev_thr;
+ vp8_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, mblim, lim,
+ hev_thr);
+
+ if (u_ptr) /* v_ptr is not tested separately */
+ vp8_mbloop_filter_horizontal_edge_uv_neon(u_ptr, uv_stride, mblim, lim,
+ hev_thr, v_ptr);
+}
+
+/* Vertical MB Filtering: runs the mbloop vertical-edge filter at y_ptr and, when u_ptr is non-NULL, at u_ptr/v_ptr */
+void vp8_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr,
+ unsigned char *v_ptr, int y_stride, int uv_stride,
+ loop_filter_info *lfi) {
+ unsigned char mblim = *lfi->mblim; /* lfi holds pointers; only the first byte behind each is read */
+ unsigned char lim = *lfi->lim;
+ unsigned char hev_thr = *lfi->hev_thr;
+
+ vp8_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr);
+
+ if (u_ptr) /* v_ptr is not tested separately */
+ vp8_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, mblim, lim,
+ hev_thr, v_ptr);
+}
+
+/* Horizontal B Filtering: filters the luma edges 4, 8 and 12 rows below y_ptr and, when u_ptr is non-NULL, the chroma edge 4 rows below u_ptr/v_ptr */
+void vp8_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr,
+ unsigned char *v_ptr, int y_stride, int uv_stride,
+ loop_filter_info *lfi) {
+ unsigned char blim = *lfi->blim; /* uses blim where the MB variants use mblim */
+ unsigned char lim = *lfi->lim;
+ unsigned char hev_thr = *lfi->hev_thr;
+
+ vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 4 * y_stride, y_stride, blim,
+ lim, hev_thr);
+ vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 8 * y_stride, y_stride, blim,
+ lim, hev_thr);
+ vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 12 * y_stride, y_stride, blim,
+ lim, hev_thr);
+
+ if (u_ptr) /* v_ptr is not tested separately */
+ vp8_loop_filter_horizontal_edge_uv_neon(u_ptr + 4 * uv_stride, uv_stride,
+ blim, lim, hev_thr,
+ v_ptr + 4 * uv_stride);
+}
+
+/* Vertical B Filtering: filters the luma edges 4, 8 and 12 bytes right of y_ptr and, when u_ptr is non-NULL, the chroma edge 4 bytes right of u_ptr/v_ptr */
+void vp8_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr,
+ unsigned char *v_ptr, int y_stride, int uv_stride,
+ loop_filter_info *lfi) {
+ unsigned char blim = *lfi->blim; /* uses blim where the MB variants use mblim */
+ unsigned char lim = *lfi->lim;
+ unsigned char hev_thr = *lfi->hev_thr;
+
+ vp8_loop_filter_vertical_edge_y_neon(y_ptr + 4, y_stride, blim, lim, hev_thr);
+ vp8_loop_filter_vertical_edge_y_neon(y_ptr + 8, y_stride, blim, lim, hev_thr);
+ vp8_loop_filter_vertical_edge_y_neon(y_ptr + 12, y_stride, blim, lim,
+ hev_thr);
+
+ if (u_ptr) /* v_ptr is not tested separately */
+ vp8_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, blim, lim,
+ hev_thr, v_ptr + 4);
+}
diff --git a/media/libvpx/libvpx/vp8/common/arm/loopfilter_arm.h b/media/libvpx/libvpx/vp8/common/arm/loopfilter_arm.h
new file mode 100644
index 0000000000..6cf660d228
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/arm/loopfilter_arm.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2019 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP8_COMMON_ARM_LOOPFILTER_ARM_H_
+#define VPX_VP8_COMMON_ARM_LOOPFILTER_ARM_H_
+/* Function types for the NEON edge filters; the lines after the typedefs declare functions of these types. */
+typedef void loopfilter_y_neon(unsigned char *src, int pitch,
+ unsigned char blimit, unsigned char limit,
+ unsigned char thresh);
+typedef void loopfilter_uv_neon(unsigned char *u, int pitch,
+ unsigned char blimit, unsigned char limit,
+ unsigned char thresh, unsigned char *v); /* u and v pointers are passed in a single call */
+/* Called by vp8_loop_filter_bh_neon / vp8_loop_filter_bv_neon in loopfilter_arm.c. */
+loopfilter_y_neon vp8_loop_filter_horizontal_edge_y_neon;
+loopfilter_y_neon vp8_loop_filter_vertical_edge_y_neon;
+loopfilter_uv_neon vp8_loop_filter_horizontal_edge_uv_neon;
+loopfilter_uv_neon vp8_loop_filter_vertical_edge_uv_neon;
+/* Called by vp8_loop_filter_mbh_neon / vp8_loop_filter_mbv_neon in loopfilter_arm.c. */
+loopfilter_y_neon vp8_mbloop_filter_horizontal_edge_y_neon;
+loopfilter_y_neon vp8_mbloop_filter_vertical_edge_y_neon;
+loopfilter_uv_neon vp8_mbloop_filter_horizontal_edge_uv_neon;
+loopfilter_uv_neon vp8_mbloop_filter_vertical_edge_uv_neon;
+
+#endif // VPX_VP8_COMMON_ARM_LOOPFILTER_ARM_H_
diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c
new file mode 100644
index 0000000000..590956dde1
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c
@@ -0,0 +1,764 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <string.h>
+
+#include "./vpx_config.h"
+#include "./vp8_rtcd.h"
+#include "vpx_dsp/arm/mem_neon.h"
+/* Bilinear tap pairs indexed by xoffset / yoffset (8 rows); each pair sums to 128, matching the shift by 7 used after filtering. */
+static const uint8_t bifilter4_coeff[8][2] = { { 128, 0 }, { 112, 16 },
+ { 96, 32 }, { 80, 48 },
+ { 64, 64 }, { 48, 80 },
+ { 32, 96 }, { 16, 112 } };
+/* Loads 8 bytes from a and shifts them, viewed as one 64-bit value, left by 32 bits. */
+static INLINE uint8x8_t load_and_shift(const unsigned char *a) {
+ return vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(vld1_u8(a)), 32)); /* NOTE(review): on a little-endian lane layout this puts bytes 0-3 in lanes 4-7 and zeroes lanes 0-3; confirm for the target */
+}
+/* 4x4 bilinear prediction: horizontal pass (skipped when xoffset == 0), then vertical pass (skipped when yoffset == 0). */
+void vp8_bilinear_predict4x4_neon(unsigned char *src_ptr,
+ int src_pixels_per_line, int xoffset,
+ int yoffset, unsigned char *dst_ptr,
+ int dst_pitch) {
+ uint8x8_t e0, e1, e2; /* first-pass output: e0 = rows 0|1, e1 = rows 2|3 (4 bytes each), e2 = row 4 */
+
+ if (xoffset == 0) { // skip_1stpass_filter
+ uint8x8_t a0, a1, a2, a3, a4;
+
+ a0 = load_and_shift(src_ptr); /* row 0, pre-shifted so vext_u8(..., 4) below can pair it with row 1 */
+ src_ptr += src_pixels_per_line;
+ a1 = vld1_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+ a2 = load_and_shift(src_ptr);
+ src_ptr += src_pixels_per_line;
+ a3 = vld1_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+ a4 = vld1_u8(src_ptr);
+
+ e0 = vext_u8(a0, a1, 4); /* lanes 4-7 of a0 followed by lanes 0-3 of a1 */
+ e1 = vext_u8(a2, a3, 4);
+ e2 = a4;
+ } else {
+ uint8x8_t a0, a1, a2, a3, a4, b4;
+ uint8x16_t a01, a23;
+ uint8x16_t b01, b23;
+ uint32x2x2_t c0, c1, c2, c3;
+ uint16x8_t d0, d1, d2;
+ const uint8x8_t filter0 = vdup_n_u8(bifilter4_coeff[xoffset][0]); /* xoffset is not range-checked here; the table has 8 rows */
+ const uint8x8_t filter1 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
+
+ a0 = vld1_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+ a1 = vld1_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+ a2 = vld1_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+ a3 = vld1_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+ a4 = vld1_u8(src_ptr);
+
+ a01 = vcombine_u8(a0, a1);
+ a23 = vcombine_u8(a2, a3);
+
+ b01 = vreinterpretq_u8_u64(vshrq_n_u64(vreinterpretq_u64_u8(a01), 8)); /* each 64-bit row shifted right by 8 bits (one byte) */
+ b23 = vreinterpretq_u8_u64(vshrq_n_u64(vreinterpretq_u64_u8(a23), 8));
+ b4 = vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(a4), 8));
+
+ c0 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(a01)),
+ vreinterpret_u32_u8(vget_high_u8(a01))); /* val[0] = first 32 bits of each of the two rows */
+ c1 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(a23)),
+ vreinterpret_u32_u8(vget_high_u8(a23)));
+ c2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(b01)),
+ vreinterpret_u32_u8(vget_high_u8(b01)));
+ c3 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(b23)),
+ vreinterpret_u32_u8(vget_high_u8(b23)));
+
+ d0 = vmull_u8(vreinterpret_u8_u32(c0.val[0]), filter0); /* unshifted bytes * filter0 ... */
+ d1 = vmull_u8(vreinterpret_u8_u32(c1.val[0]), filter0);
+ d2 = vmull_u8(a4, filter0);
+
+ d0 = vmlal_u8(d0, vreinterpret_u8_u32(c2.val[0]), filter1); /* ... plus shifted bytes * filter1 */
+ d1 = vmlal_u8(d1, vreinterpret_u8_u32(c3.val[0]), filter1);
+ d2 = vmlal_u8(d2, b4, filter1);
+
+ e0 = vqrshrn_n_u16(d0, 7); /* rounding shift right by 7, saturating narrow to 8 bits */
+ e1 = vqrshrn_n_u16(d1, 7);
+ e2 = vqrshrn_n_u16(d2, 7);
+ }
+
+ // secondpass_filter
+ if (yoffset == 0) { // skip_2ndpass_filter
+ store_unaligned_u8q(dst_ptr, dst_pitch, vcombine_u8(e0, e1)); /* NOTE(review): helper defined outside this file, presumably vpx_dsp/arm/mem_neon.h; confirm its row layout */
+ } else {
+ uint8x8_t f0, f1;
+ const uint8x8_t filter0 = vdup_n_u8(bifilter4_coeff[yoffset][0]); /* yoffset is not range-checked here either */
+ const uint8x8_t filter1 = vdup_n_u8(bifilter4_coeff[yoffset][1]);
+
+ uint16x8_t b0 = vmull_u8(e0, filter0);
+ uint16x8_t b1 = vmull_u8(e1, filter0);
+
+ const uint8x8_t a0 = vext_u8(e0, e1, 4); /* rows 1|2: the rows directly below those in e0 */
+ const uint8x8_t a1 = vext_u8(e1, e2, 4); /* rows 3|4: the rows directly below those in e1 */
+
+ b0 = vmlal_u8(b0, a0, filter1);
+ b1 = vmlal_u8(b1, a1, filter1);
+
+ f0 = vqrshrn_n_u16(b0, 7);
+ f1 = vqrshrn_n_u16(b1, 7);
+
+ store_unaligned_u8q(dst_ptr, dst_pitch, vcombine_u8(f0, f1));
+ }
+}
+/* Bilinear prediction writing 4 rows of 8 bytes: the first pass fills d22u8..d26u8 (5 rows), the second pass blends each row with the one below. */
+void vp8_bilinear_predict8x4_neon(unsigned char *src_ptr,
+ int src_pixels_per_line, int xoffset,
+ int yoffset, unsigned char *dst_ptr,
+ int dst_pitch) {
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8;
+ uint8x8_t d7u8, d9u8, d11u8, d22u8, d23u8, d24u8, d25u8, d26u8;
+ uint8x16_t q1u8, q2u8, q3u8, q4u8, q5u8;
+ uint16x8_t q1u16, q2u16, q3u16, q4u16;
+ uint16x8_t q6u16, q7u16, q8u16, q9u16, q10u16;
+
+ if (xoffset == 0) { // skip_1stpass_filter
+ d22u8 = vld1_u8(src_ptr); /* 5 source rows of 8 bytes, used unfiltered */
+ src_ptr += src_pixels_per_line;
+ d23u8 = vld1_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+ d24u8 = vld1_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+ d25u8 = vld1_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+ d26u8 = vld1_u8(src_ptr);
+ } else {
+ q1u8 = vld1q_u8(src_ptr); /* 16-byte loads, although only bytes 0..8 of each row are used below */
+ src_ptr += src_pixels_per_line;
+ q2u8 = vld1q_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+ q3u8 = vld1q_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+ q4u8 = vld1q_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+ q5u8 = vld1q_u8(src_ptr);
+
+ d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]); /* xoffset is not range-checked here; the table has 8 rows */
+ d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
+
+ q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8); /* bytes 0..7 * first tap */
+ q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8);
+ q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
+ q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
+ q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
+
+ d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1); /* bytes 1..8 of the row */
+ d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1);
+ d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
+ d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
+ d11u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
+
+ q6u16 = vmlal_u8(q6u16, d3u8, d1u8); /* plus bytes 1..8 * second tap */
+ q7u16 = vmlal_u8(q7u16, d5u8, d1u8);
+ q8u16 = vmlal_u8(q8u16, d7u8, d1u8);
+ q9u16 = vmlal_u8(q9u16, d9u8, d1u8);
+ q10u16 = vmlal_u8(q10u16, d11u8, d1u8);
+
+ d22u8 = vqrshrn_n_u16(q6u16, 7); /* rounding shift right by 7, saturating narrow to 8 bits */
+ d23u8 = vqrshrn_n_u16(q7u16, 7);
+ d24u8 = vqrshrn_n_u16(q8u16, 7);
+ d25u8 = vqrshrn_n_u16(q9u16, 7);
+ d26u8 = vqrshrn_n_u16(q10u16, 7);
+ }
+
+ // secondpass_filter
+ if (yoffset == 0) { // skip_2ndpass_filter
+ vst1_u8((uint8_t *)dst_ptr, d22u8); /* rows 0..3 are stored as they are; d26u8 is not needed on this path */
+ dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d23u8);
+ dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d24u8);
+ dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d25u8);
+ } else {
+ d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]); /* d0u8/d1u8 are reused for the vertical taps */
+ d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]);
+
+ q1u16 = vmull_u8(d22u8, d0u8); /* row i * first tap */
+ q2u16 = vmull_u8(d23u8, d0u8);
+ q3u16 = vmull_u8(d24u8, d0u8);
+ q4u16 = vmull_u8(d25u8, d0u8);
+
+ q1u16 = vmlal_u8(q1u16, d23u8, d1u8); /* plus row i + 1 * second tap */
+ q2u16 = vmlal_u8(q2u16, d24u8, d1u8);
+ q3u16 = vmlal_u8(q3u16, d25u8, d1u8);
+ q4u16 = vmlal_u8(q4u16, d26u8, d1u8);
+
+ d2u8 = vqrshrn_n_u16(q1u16, 7);
+ d3u8 = vqrshrn_n_u16(q2u16, 7);
+ d4u8 = vqrshrn_n_u16(q3u16, 7);
+ d5u8 = vqrshrn_n_u16(q4u16, 7);
+
+ vst1_u8((uint8_t *)dst_ptr, d2u8);
+ dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d3u8);
+ dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d4u8);
+ dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d5u8);
+ }
+ return;
+}
+/* Bilinear prediction writing 8 rows of 8 bytes: the first pass fills d22u8..d30u8 (9 rows), the second pass blends each row with the one below. */
+void vp8_bilinear_predict8x8_neon(unsigned char *src_ptr,
+ int src_pixels_per_line, int xoffset,
+ int yoffset, unsigned char *dst_ptr,
+ int dst_pitch) {
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8, d11u8;
+ uint8x8_t d22u8, d23u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8, d30u8;
+ uint8x16_t q1u8, q2u8, q3u8, q4u8, q5u8;
+ uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16;
+ uint16x8_t q6u16, q7u16, q8u16, q9u16, q10u16;
+
+ if (xoffset == 0) { // skip_1stpass_filter
+ d22u8 = vld1_u8(src_ptr); /* 9 source rows of 8 bytes, used unfiltered */
+ src_ptr += src_pixels_per_line;
+ d23u8 = vld1_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+ d24u8 = vld1_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+ d25u8 = vld1_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+ d26u8 = vld1_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+ d27u8 = vld1_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+ d28u8 = vld1_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+ d29u8 = vld1_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+ d30u8 = vld1_u8(src_ptr);
+ } else {
+ q1u8 = vld1q_u8(src_ptr); /* first 4 rows; 16-byte loads, although only bytes 0..8 of each row are used */
+ src_ptr += src_pixels_per_line;
+ q2u8 = vld1q_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+ q3u8 = vld1q_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+ q4u8 = vld1q_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+
+ d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]); /* xoffset is not range-checked here; the table has 8 rows */
+ d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
+
+ q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8); /* bytes 0..7 * first tap */
+ q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8);
+ q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
+ q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
+
+ d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1); /* bytes 1..8 of the row */
+ d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1);
+ d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
+ d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
+
+ q6u16 = vmlal_u8(q6u16, d3u8, d1u8); /* plus bytes 1..8 * second tap */
+ q7u16 = vmlal_u8(q7u16, d5u8, d1u8);
+ q8u16 = vmlal_u8(q8u16, d7u8, d1u8);
+ q9u16 = vmlal_u8(q9u16, d9u8, d1u8);
+
+ d22u8 = vqrshrn_n_u16(q6u16, 7); /* rounding shift right by 7, saturating narrow to 8 bits */
+ d23u8 = vqrshrn_n_u16(q7u16, 7);
+ d24u8 = vqrshrn_n_u16(q8u16, 7);
+ d25u8 = vqrshrn_n_u16(q9u16, 7);
+
+ // first-pass filtering of the remaining 5 rows
+ q1u8 = vld1q_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+ q2u8 = vld1q_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+ q3u8 = vld1q_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+ q4u8 = vld1q_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+ q5u8 = vld1q_u8(src_ptr);
+
+ q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8);
+ q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8);
+ q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
+ q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
+ q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
+
+ d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1);
+ d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1);
+ d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
+ d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
+ d11u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
+
+ q6u16 = vmlal_u8(q6u16, d3u8, d1u8);
+ q7u16 = vmlal_u8(q7u16, d5u8, d1u8);
+ q8u16 = vmlal_u8(q8u16, d7u8, d1u8);
+ q9u16 = vmlal_u8(q9u16, d9u8, d1u8);
+ q10u16 = vmlal_u8(q10u16, d11u8, d1u8);
+
+ d26u8 = vqrshrn_n_u16(q6u16, 7);
+ d27u8 = vqrshrn_n_u16(q7u16, 7);
+ d28u8 = vqrshrn_n_u16(q8u16, 7);
+ d29u8 = vqrshrn_n_u16(q9u16, 7);
+ d30u8 = vqrshrn_n_u16(q10u16, 7);
+ }
+
+ // secondpass_filter
+ if (yoffset == 0) { // skip_2ndpass_filter
+ vst1_u8((uint8_t *)dst_ptr, d22u8); /* rows 0..7 are stored as they are; d30u8 is not needed on this path */
+ dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d23u8);
+ dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d24u8);
+ dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d25u8);
+ dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d26u8);
+ dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d27u8);
+ dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d28u8);
+ dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d29u8);
+ } else {
+ d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]); /* d0u8/d1u8 are reused for the vertical taps */
+ d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]);
+
+ q1u16 = vmull_u8(d22u8, d0u8); /* row i * first tap */
+ q2u16 = vmull_u8(d23u8, d0u8);
+ q3u16 = vmull_u8(d24u8, d0u8);
+ q4u16 = vmull_u8(d25u8, d0u8);
+ q5u16 = vmull_u8(d26u8, d0u8);
+ q6u16 = vmull_u8(d27u8, d0u8);
+ q7u16 = vmull_u8(d28u8, d0u8);
+ q8u16 = vmull_u8(d29u8, d0u8);
+
+ q1u16 = vmlal_u8(q1u16, d23u8, d1u8); /* plus row i + 1 * second tap */
+ q2u16 = vmlal_u8(q2u16, d24u8, d1u8);
+ q3u16 = vmlal_u8(q3u16, d25u8, d1u8);
+ q4u16 = vmlal_u8(q4u16, d26u8, d1u8);
+ q5u16 = vmlal_u8(q5u16, d27u8, d1u8);
+ q6u16 = vmlal_u8(q6u16, d28u8, d1u8);
+ q7u16 = vmlal_u8(q7u16, d29u8, d1u8);
+ q8u16 = vmlal_u8(q8u16, d30u8, d1u8);
+
+ d2u8 = vqrshrn_n_u16(q1u16, 7);
+ d3u8 = vqrshrn_n_u16(q2u16, 7);
+ d4u8 = vqrshrn_n_u16(q3u16, 7);
+ d5u8 = vqrshrn_n_u16(q4u16, 7);
+ d6u8 = vqrshrn_n_u16(q5u16, 7);
+ d7u8 = vqrshrn_n_u16(q6u16, 7);
+ d8u8 = vqrshrn_n_u16(q7u16, 7);
+ d9u8 = vqrshrn_n_u16(q8u16, 7);
+
+ vst1_u8((uint8_t *)dst_ptr, d2u8);
+ dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d3u8);
+ dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d4u8);
+ dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d5u8);
+ dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d6u8);
+ dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d7u8);
+ dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d8u8);
+ dst_ptr += dst_pitch;
+ vst1_u8((uint8_t *)dst_ptr, d9u8);
+ }
+ return;
+}
+// 16x16 bilinear prediction (NEON): filters the block at src_ptr into dst_ptr.
+void vp8_bilinear_predict16x16_neon(unsigned char *src_ptr,
+ int src_pixels_per_line, int xoffset,
+ int yoffset, unsigned char *dst_ptr,
+ int dst_pitch) {  // xoffset/yoffset index bifilter4_coeff (defined outside this view)
+ int i;
+ unsigned char tmp[272];  // first-pass output for the two-pass path: 17 rows x 16 bytes
+ unsigned char *tmpp;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
+ uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d16u8, d17u8, d18u8;
+ uint8x8_t d19u8, d20u8, d21u8;
+ uint8x16_t q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8;
+ uint8x16_t q11u8, q12u8, q13u8, q14u8, q15u8;
+ uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16, q6u16, q7u16, q8u16;
+ uint16x8_t q9u16, q10u16, q11u16, q12u16, q13u16, q14u16;
+
+ if (xoffset == 0) { // secondpass_bfilter16x16_only: vertical taps only
+ d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]);  // weight of the current row
+ d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]);  // weight of the row below
+
+ q11u8 = vld1q_u8(src_ptr);  // row 0; the loop reads 4 more rows per pass (17 in total)
+ src_ptr += src_pixels_per_line;
+ for (i = 4; i > 0; i--) {  // 4 passes x 4 output rows
+ q12u8 = vld1q_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+ q13u8 = vld1q_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+ q14u8 = vld1q_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+ q15u8 = vld1q_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+
+ q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8);
+ q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8);
+ q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8);
+ q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8);
+ q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8);
+ q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8);
+ q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8);
+ q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8);
+
+ q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8);
+ q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8);
+ q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8);
+ q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8);
+ q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8);
+ q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8);
+ q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8);
+ q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8);
+
+ d2u8 = vqrshrn_n_u16(q1u16, 7);  // (sum + 64) >> 7, saturated to 8 bits
+ d3u8 = vqrshrn_n_u16(q2u16, 7);
+ d4u8 = vqrshrn_n_u16(q3u16, 7);
+ d5u8 = vqrshrn_n_u16(q4u16, 7);
+ d6u8 = vqrshrn_n_u16(q5u16, 7);
+ d7u8 = vqrshrn_n_u16(q6u16, 7);
+ d8u8 = vqrshrn_n_u16(q7u16, 7);
+ d9u8 = vqrshrn_n_u16(q8u16, 7);
+
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ q2u8 = vcombine_u8(d4u8, d5u8);
+ q3u8 = vcombine_u8(d6u8, d7u8);
+ q4u8 = vcombine_u8(d8u8, d9u8);
+
+ q11u8 = q15u8;  // last row loaded is the first row of the next pass
+
+ vst1q_u8((uint8_t *)dst_ptr, q1u8);
+ dst_ptr += dst_pitch;
+ vst1q_u8((uint8_t *)dst_ptr, q2u8);
+ dst_ptr += dst_pitch;
+ vst1q_u8((uint8_t *)dst_ptr, q3u8);
+ dst_ptr += dst_pitch;
+ vst1q_u8((uint8_t *)dst_ptr, q4u8);
+ dst_ptr += dst_pitch;
+ }
+ return;
+ }
+
+ if (yoffset == 0) { // firstpass_bfilter16x16_only: horizontal taps only
+ d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);  // weight of the current pixel
+ d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);  // weight of the pixel to its right
+
+ for (i = 4; i > 0; i--) {  // 4 passes x 4 rows
+ d2u8 = vld1_u8(src_ptr);
+ d3u8 = vld1_u8(src_ptr + 8);
+ d4u8 = vld1_u8(src_ptr + 16);  // reads bytes 16..23; only byte 16 is consumed below
+ src_ptr += src_pixels_per_line;
+ d5u8 = vld1_u8(src_ptr);
+ d6u8 = vld1_u8(src_ptr + 8);
+ d7u8 = vld1_u8(src_ptr + 16);
+ src_ptr += src_pixels_per_line;
+ d8u8 = vld1_u8(src_ptr);
+ d9u8 = vld1_u8(src_ptr + 8);
+ d10u8 = vld1_u8(src_ptr + 16);
+ src_ptr += src_pixels_per_line;
+ d11u8 = vld1_u8(src_ptr);
+ d12u8 = vld1_u8(src_ptr + 8);
+ d13u8 = vld1_u8(src_ptr + 16);
+ src_ptr += src_pixels_per_line;
+
+ q7u16 = vmull_u8(d2u8, d0u8);
+ q8u16 = vmull_u8(d3u8, d0u8);
+ q9u16 = vmull_u8(d5u8, d0u8);
+ q10u16 = vmull_u8(d6u8, d0u8);
+ q11u16 = vmull_u8(d8u8, d0u8);
+ q12u16 = vmull_u8(d9u8, d0u8);
+ q13u16 = vmull_u8(d11u8, d0u8);
+ q14u16 = vmull_u8(d12u8, d0u8);
+
+ d2u8 = vext_u8(d2u8, d3u8, 1);  // pixels 1..8: right-hand neighbours of pixels 0..7
+ d5u8 = vext_u8(d5u8, d6u8, 1);
+ d8u8 = vext_u8(d8u8, d9u8, 1);
+ d11u8 = vext_u8(d11u8, d12u8, 1);
+
+ q7u16 = vmlal_u8(q7u16, d2u8, d1u8);
+ q9u16 = vmlal_u8(q9u16, d5u8, d1u8);
+ q11u16 = vmlal_u8(q11u16, d8u8, d1u8);
+ q13u16 = vmlal_u8(q13u16, d11u8, d1u8);
+
+ d3u8 = vext_u8(d3u8, d4u8, 1);  // pixels 9..16: right-hand neighbours of pixels 8..15
+ d6u8 = vext_u8(d6u8, d7u8, 1);
+ d9u8 = vext_u8(d9u8, d10u8, 1);
+ d12u8 = vext_u8(d12u8, d13u8, 1);
+
+ q8u16 = vmlal_u8(q8u16, d3u8, d1u8);
+ q10u16 = vmlal_u8(q10u16, d6u8, d1u8);
+ q12u16 = vmlal_u8(q12u16, d9u8, d1u8);
+ q14u16 = vmlal_u8(q14u16, d12u8, d1u8);
+
+ d14u8 = vqrshrn_n_u16(q7u16, 7);  // (sum + 64) >> 7, saturated to 8 bits
+ d15u8 = vqrshrn_n_u16(q8u16, 7);
+ d16u8 = vqrshrn_n_u16(q9u16, 7);
+ d17u8 = vqrshrn_n_u16(q10u16, 7);
+ d18u8 = vqrshrn_n_u16(q11u16, 7);
+ d19u8 = vqrshrn_n_u16(q12u16, 7);
+ d20u8 = vqrshrn_n_u16(q13u16, 7);
+ d21u8 = vqrshrn_n_u16(q14u16, 7);
+
+ q7u8 = vcombine_u8(d14u8, d15u8);
+ q8u8 = vcombine_u8(d16u8, d17u8);
+ q9u8 = vcombine_u8(d18u8, d19u8);
+ q10u8 = vcombine_u8(d20u8, d21u8);
+
+ vst1q_u8((uint8_t *)dst_ptr, q7u8);
+ dst_ptr += dst_pitch;
+ vst1q_u8((uint8_t *)dst_ptr, q8u8);
+ dst_ptr += dst_pitch;
+ vst1q_u8((uint8_t *)dst_ptr, q9u8);
+ dst_ptr += dst_pitch;
+ vst1q_u8((uint8_t *)dst_ptr, q10u8);
+ dst_ptr += dst_pitch;
+ }
+ return;
+ }
+
+ d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);  // two-pass path: horizontal taps first
+ d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
+
+ d2u8 = vld1_u8(src_ptr);  // preload source rows 0..3 (24 bytes each)
+ d3u8 = vld1_u8(src_ptr + 8);
+ d4u8 = vld1_u8(src_ptr + 16);
+ src_ptr += src_pixels_per_line;
+ d5u8 = vld1_u8(src_ptr);
+ d6u8 = vld1_u8(src_ptr + 8);
+ d7u8 = vld1_u8(src_ptr + 16);
+ src_ptr += src_pixels_per_line;
+ d8u8 = vld1_u8(src_ptr);
+ d9u8 = vld1_u8(src_ptr + 8);
+ d10u8 = vld1_u8(src_ptr + 16);
+ src_ptr += src_pixels_per_line;
+ d11u8 = vld1_u8(src_ptr);
+ d12u8 = vld1_u8(src_ptr + 8);
+ d13u8 = vld1_u8(src_ptr + 16);
+ src_ptr += src_pixels_per_line;
+
+ // First Pass: output_height lines x output_width columns (17x16)
+ tmpp = tmp;
+ for (i = 3; i > 0; i--) {  // 3 x 4 = 12 rows here; the other 5 rows follow the loop
+ q7u16 = vmull_u8(d2u8, d0u8);
+ q8u16 = vmull_u8(d3u8, d0u8);
+ q9u16 = vmull_u8(d5u8, d0u8);
+ q10u16 = vmull_u8(d6u8, d0u8);
+ q11u16 = vmull_u8(d8u8, d0u8);
+ q12u16 = vmull_u8(d9u8, d0u8);
+ q13u16 = vmull_u8(d11u8, d0u8);
+ q14u16 = vmull_u8(d12u8, d0u8);
+
+ d2u8 = vext_u8(d2u8, d3u8, 1);
+ d5u8 = vext_u8(d5u8, d6u8, 1);
+ d8u8 = vext_u8(d8u8, d9u8, 1);
+ d11u8 = vext_u8(d11u8, d12u8, 1);
+
+ q7u16 = vmlal_u8(q7u16, d2u8, d1u8);
+ q9u16 = vmlal_u8(q9u16, d5u8, d1u8);
+ q11u16 = vmlal_u8(q11u16, d8u8, d1u8);
+ q13u16 = vmlal_u8(q13u16, d11u8, d1u8);
+
+ d3u8 = vext_u8(d3u8, d4u8, 1);
+ d6u8 = vext_u8(d6u8, d7u8, 1);
+ d9u8 = vext_u8(d9u8, d10u8, 1);
+ d12u8 = vext_u8(d12u8, d13u8, 1);
+
+ q8u16 = vmlal_u8(q8u16, d3u8, d1u8);
+ q10u16 = vmlal_u8(q10u16, d6u8, d1u8);
+ q12u16 = vmlal_u8(q12u16, d9u8, d1u8);
+ q14u16 = vmlal_u8(q14u16, d12u8, d1u8);
+
+ d14u8 = vqrshrn_n_u16(q7u16, 7);
+ d15u8 = vqrshrn_n_u16(q8u16, 7);
+ d16u8 = vqrshrn_n_u16(q9u16, 7);
+ d17u8 = vqrshrn_n_u16(q10u16, 7);
+ d18u8 = vqrshrn_n_u16(q11u16, 7);
+ d19u8 = vqrshrn_n_u16(q12u16, 7);
+ d20u8 = vqrshrn_n_u16(q13u16, 7);
+ d21u8 = vqrshrn_n_u16(q14u16, 7);
+
+ d2u8 = vld1_u8(src_ptr);  // load the next 4 source rows before storing this pass
+ d3u8 = vld1_u8(src_ptr + 8);
+ d4u8 = vld1_u8(src_ptr + 16);
+ src_ptr += src_pixels_per_line;
+ d5u8 = vld1_u8(src_ptr);
+ d6u8 = vld1_u8(src_ptr + 8);
+ d7u8 = vld1_u8(src_ptr + 16);
+ src_ptr += src_pixels_per_line;
+ d8u8 = vld1_u8(src_ptr);
+ d9u8 = vld1_u8(src_ptr + 8);
+ d10u8 = vld1_u8(src_ptr + 16);
+ src_ptr += src_pixels_per_line;
+ d11u8 = vld1_u8(src_ptr);
+ d12u8 = vld1_u8(src_ptr + 8);
+ d13u8 = vld1_u8(src_ptr + 16);
+ src_ptr += src_pixels_per_line;
+
+ q7u8 = vcombine_u8(d14u8, d15u8);
+ q8u8 = vcombine_u8(d16u8, d17u8);
+ q9u8 = vcombine_u8(d18u8, d19u8);
+ q10u8 = vcombine_u8(d20u8, d21u8);
+
+ vst1q_u8((uint8_t *)tmpp, q7u8);
+ tmpp += 16;
+ vst1q_u8((uint8_t *)tmpp, q8u8);
+ tmpp += 16;
+ vst1q_u8((uint8_t *)tmpp, q9u8);
+ tmpp += 16;
+ vst1q_u8((uint8_t *)tmpp, q10u8);
+ tmpp += 16;
+ }
+
+ // First-pass filtering for rest 5 lines
+ d14u8 = vld1_u8(src_ptr);  // 17th source row; rows 12..15 are already in d2..d13
+ d15u8 = vld1_u8(src_ptr + 8);
+ d16u8 = vld1_u8(src_ptr + 16);
+ src_ptr += src_pixels_per_line;
+
+ q9u16 = vmull_u8(d2u8, d0u8);
+ q10u16 = vmull_u8(d3u8, d0u8);
+ q11u16 = vmull_u8(d5u8, d0u8);
+ q12u16 = vmull_u8(d6u8, d0u8);
+ q13u16 = vmull_u8(d8u8, d0u8);
+ q14u16 = vmull_u8(d9u8, d0u8);
+
+ d2u8 = vext_u8(d2u8, d3u8, 1);
+ d5u8 = vext_u8(d5u8, d6u8, 1);
+ d8u8 = vext_u8(d8u8, d9u8, 1);
+
+ q9u16 = vmlal_u8(q9u16, d2u8, d1u8);
+ q11u16 = vmlal_u8(q11u16, d5u8, d1u8);
+ q13u16 = vmlal_u8(q13u16, d8u8, d1u8);
+
+ d3u8 = vext_u8(d3u8, d4u8, 1);
+ d6u8 = vext_u8(d6u8, d7u8, 1);
+ d9u8 = vext_u8(d9u8, d10u8, 1);
+
+ q10u16 = vmlal_u8(q10u16, d3u8, d1u8);
+ q12u16 = vmlal_u8(q12u16, d6u8, d1u8);
+ q14u16 = vmlal_u8(q14u16, d9u8, d1u8);
+
+ q1u16 = vmull_u8(d11u8, d0u8);
+ q2u16 = vmull_u8(d12u8, d0u8);
+ q3u16 = vmull_u8(d14u8, d0u8);
+ q4u16 = vmull_u8(d15u8, d0u8);
+
+ d11u8 = vext_u8(d11u8, d12u8, 1);
+ d14u8 = vext_u8(d14u8, d15u8, 1);
+
+ q1u16 = vmlal_u8(q1u16, d11u8, d1u8);
+ q3u16 = vmlal_u8(q3u16, d14u8, d1u8);
+
+ d12u8 = vext_u8(d12u8, d13u8, 1);
+ d15u8 = vext_u8(d15u8, d16u8, 1);
+
+ q2u16 = vmlal_u8(q2u16, d12u8, d1u8);
+ q4u16 = vmlal_u8(q4u16, d15u8, d1u8);
+
+ d10u8 = vqrshrn_n_u16(q9u16, 7);
+ d11u8 = vqrshrn_n_u16(q10u16, 7);
+ d12u8 = vqrshrn_n_u16(q11u16, 7);
+ d13u8 = vqrshrn_n_u16(q12u16, 7);
+ d14u8 = vqrshrn_n_u16(q13u16, 7);
+ d15u8 = vqrshrn_n_u16(q14u16, 7);
+ d16u8 = vqrshrn_n_u16(q1u16, 7);
+ d17u8 = vqrshrn_n_u16(q2u16, 7);
+ d18u8 = vqrshrn_n_u16(q3u16, 7);
+ d19u8 = vqrshrn_n_u16(q4u16, 7);
+
+ q5u8 = vcombine_u8(d10u8, d11u8);
+ q6u8 = vcombine_u8(d12u8, d13u8);
+ q7u8 = vcombine_u8(d14u8, d15u8);
+ q8u8 = vcombine_u8(d16u8, d17u8);
+ q9u8 = vcombine_u8(d18u8, d19u8);
+
+ vst1q_u8((uint8_t *)tmpp, q5u8);
+ tmpp += 16;
+ vst1q_u8((uint8_t *)tmpp, q6u8);
+ tmpp += 16;
+ vst1q_u8((uint8_t *)tmpp, q7u8);
+ tmpp += 16;
+ vst1q_u8((uint8_t *)tmpp, q8u8);
+ tmpp += 16;
+ vst1q_u8((uint8_t *)tmpp, q9u8);  // 17th and last row of tmp
+
+ // secondpass_filter: vertical taps over the 17 rows held in tmp
+ d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]);
+ d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]);
+
+ tmpp = tmp;
+ q11u8 = vld1q_u8(tmpp);
+ tmpp += 16;
+ for (i = 4; i > 0; i--) {  // same arithmetic as the xoffset == 0 path, reading tmp
+ q12u8 = vld1q_u8(tmpp);
+ tmpp += 16;
+ q13u8 = vld1q_u8(tmpp);
+ tmpp += 16;
+ q14u8 = vld1q_u8(tmpp);
+ tmpp += 16;
+ q15u8 = vld1q_u8(tmpp);
+ tmpp += 16;
+
+ q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8);
+ q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8);
+ q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8);
+ q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8);
+ q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8);
+ q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8);
+ q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8);
+ q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8);
+
+ q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8);
+ q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8);
+ q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8);
+ q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8);
+ q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8);
+ q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8);
+ q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8);
+ q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8);
+
+ d2u8 = vqrshrn_n_u16(q1u16, 7);
+ d3u8 = vqrshrn_n_u16(q2u16, 7);
+ d4u8 = vqrshrn_n_u16(q3u16, 7);
+ d5u8 = vqrshrn_n_u16(q4u16, 7);
+ d6u8 = vqrshrn_n_u16(q5u16, 7);
+ d7u8 = vqrshrn_n_u16(q6u16, 7);
+ d8u8 = vqrshrn_n_u16(q7u16, 7);
+ d9u8 = vqrshrn_n_u16(q8u16, 7);
+
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ q2u8 = vcombine_u8(d4u8, d5u8);
+ q3u8 = vcombine_u8(d6u8, d7u8);
+ q4u8 = vcombine_u8(d8u8, d9u8);
+
+ q11u8 = q15u8;
+
+ vst1q_u8((uint8_t *)dst_ptr, q1u8);
+ dst_ptr += dst_pitch;
+ vst1q_u8((uint8_t *)dst_ptr, q2u8);
+ dst_ptr += dst_pitch;
+ vst1q_u8((uint8_t *)dst_ptr, q3u8);
+ dst_ptr += dst_pitch;
+ vst1q_u8((uint8_t *)dst_ptr, q4u8);
+ dst_ptr += dst_pitch;
+ }
+ return;
+}
diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/copymem_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/copymem_neon.c
new file mode 100644
index 0000000000..c89b47d628
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/arm/neon/copymem_neon.c
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vp8_rtcd.h"
+// Copies 4 rows of 8 bytes from src to dst; each pointer steps by its own stride.
+void vp8_copy_mem8x4_neon(unsigned char *src, int src_stride,
+ unsigned char *dst, int dst_stride) {
+ uint8x8_t vtmp;  // one 8-byte row in flight
+ int r;
+
+ for (r = 0; r < 4; ++r) {
+ vtmp = vld1_u8(src);
+ vst1_u8(dst, vtmp);
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+// Copies 8 rows of 8 bytes from src to dst; each pointer steps by its own stride.
+void vp8_copy_mem8x8_neon(unsigned char *src, int src_stride,
+ unsigned char *dst, int dst_stride) {
+ uint8x8_t vtmp;  // one 8-byte row in flight
+ int r;
+
+ for (r = 0; r < 8; ++r) {
+ vtmp = vld1_u8(src);
+ vst1_u8(dst, vtmp);
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+// Copies 16 rows of 16 bytes from src to dst; each pointer steps by its own stride.
+void vp8_copy_mem16x16_neon(unsigned char *src, int src_stride,
+ unsigned char *dst, int dst_stride) {
+ int r;
+ uint8x16_t qtmp;  // one 16-byte row in flight
+
+ for (r = 0; r < 16; ++r) {
+ qtmp = vld1q_u8(src);
+ vst1q_u8(dst, qtmp);
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c
new file mode 100644
index 0000000000..d12c3a8392
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vp8_rtcd.h"
+// Adds the rounded DC term (input_dc + 4) >> 3 to a 4x4 block of pred_ptr and stores it, clamped, to dst_ptr.
+void vp8_dc_only_idct_add_neon(int16_t input_dc, unsigned char *pred_ptr,
+ int pred_stride, unsigned char *dst_ptr,
+ int dst_stride) {
+ int i;
+ uint16_t a1 = ((input_dc + 4) >> 3);  // a negative result wraps here and is re-read as signed at vqmovun_s16
+ uint32x2_t d2u32 = vdup_n_u32(0);
+ uint8x8_t d2u8;
+ uint16x8_t q1u16;
+ uint16x8_t qAdd;
+
+ qAdd = vdupq_n_u16(a1);
+
+ for (i = 0; i < 2; ++i) {  // two rows per iteration, four rows in total
+ d2u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d2u32, 0);  // 4 pixels of one row; NOTE(review): byte pointer read as uint32_t
+ pred_ptr += pred_stride;
+ d2u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d2u32, 1);
+ pred_ptr += pred_stride;
+
+ q1u16 = vaddw_u8(qAdd, vreinterpret_u8_u32(d2u32));  // widen the pixels to 16 bits and add the DC term
+ d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16));  // clamp to [0, 255]
+
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 0);
+ dst_ptr += dst_stride;
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 1);
+ dst_ptr += dst_stride;
+ }
+}
diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/dequant_idct_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/dequant_idct_neon.c
new file mode 100644
index 0000000000..5445f2965a
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/arm/neon/dequant_idct_neon.c
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vp8_rtcd.h"
+// IDCT multipliers in 1/65536 units: 20091 / 65536 ~= sqrt(2) * cos(pi / 8) - 1.
+static const int16_t cospi8sqrt2minus1 = 20091;
+// 35468 exceeds INT16_MAX and gets converted to a negative number. Because of
+// the way it is used in vqdmulh, where the result is doubled, it can be divided
+// by 2 beforehand. This saves compensating for the negative value as well as
+// shifting the result.
+static const int16_t sinpi8sqrt2 = 35468 >> 1;
+// Dequantizes one 4x4 block (input * dq), inverse-transforms it, adds it to the pixels at dst, and zeroes input.
+void vp8_dequant_idct_add_neon(int16_t *input, int16_t *dq, unsigned char *dst,
+ int stride) {
+ unsigned char *dst0;
+ int32x2_t d14, d15;
+ int16x4_t d2, d3, d4, d5, d10, d11, d12, d13;
+ int16x8_t q1, q2, q3, q4, q5, q6;
+ int16x8_t qEmpty = vdupq_n_s16(0);
+ int32x2x2_t d2tmp0, d2tmp1;
+ int16x4x2_t d2tmp2, d2tmp3;
+
+ d14 = d15 = vdup_n_s32(0);
+
+ // load input
+ q3 = vld1q_s16(input);
+ vst1q_s16(input, qEmpty);  // coefficients are cleared as soon as they are read
+ input += 8;
+ q4 = vld1q_s16(input);
+ vst1q_s16(input, qEmpty);
+
+ // load dq
+ q5 = vld1q_s16(dq);
+ dq += 8;
+ q6 = vld1q_s16(dq);
+
+ // load src from dst
+ dst0 = dst;
+ d14 = vld1_lane_s32((const int32_t *)dst0, d14, 0);  // 4 pixels per lane: d14 = rows 0-1, d15 = rows 2-3
+ dst0 += stride;
+ d14 = vld1_lane_s32((const int32_t *)dst0, d14, 1);
+ dst0 += stride;
+ d15 = vld1_lane_s32((const int32_t *)dst0, d15, 0);
+ dst0 += stride;
+ d15 = vld1_lane_s32((const int32_t *)dst0, d15, 1);
+
+ q1 = vreinterpretq_s16_u16(  // dequantize: lane-wise input * dq, low 16 bits kept
+ vmulq_u16(vreinterpretq_u16_s16(q3), vreinterpretq_u16_s16(q5)));
+ q2 = vreinterpretq_s16_u16(
+ vmulq_u16(vreinterpretq_u16_s16(q4), vreinterpretq_u16_s16(q6)));
+
+ d12 = vqadd_s16(vget_low_s16(q1), vget_low_s16(q2));  // in[0..3] + in[8..11]
+ d13 = vqsub_s16(vget_low_s16(q1), vget_low_s16(q2));  // in[0..3] - in[8..11]
+
+ q2 = vcombine_s16(vget_high_s16(q1), vget_high_s16(q2));  // in[4..7] | in[12..15]
+
+ q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2);
+ q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1);
+
+ q4 = vshrq_n_s16(q4, 1);  // undo vqdmulh's doubling for the cos term
+
+ q4 = vqaddq_s16(q4, q2);  // add x back, since the constant is the "minus 1" form
+
+ d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4));
+ d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4));
+
+ d2 = vqadd_s16(d12, d11);
+ d3 = vqadd_s16(d13, d10);
+ d4 = vqsub_s16(d13, d10);
+ d5 = vqsub_s16(d12, d11);
+
+ d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));  // 4x4 transpose ahead of the second pass
+ d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
+ d2tmp2 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]),
+ vreinterpret_s16_s32(d2tmp1.val[0]));
+ d2tmp3 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]),
+ vreinterpret_s16_s32(d2tmp1.val[1]));
+
+ // loop 2
+ q2 = vcombine_s16(d2tmp2.val[1], d2tmp3.val[1]);
+
+ q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2);
+ q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1);
+
+ d12 = vqadd_s16(d2tmp2.val[0], d2tmp3.val[0]);
+ d13 = vqsub_s16(d2tmp2.val[0], d2tmp3.val[0]);
+
+ q4 = vshrq_n_s16(q4, 1);
+
+ q4 = vqaddq_s16(q4, q2);
+
+ d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4));
+ d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4));
+
+ d2 = vqadd_s16(d12, d11);
+ d3 = vqadd_s16(d13, d10);
+ d4 = vqsub_s16(d13, d10);
+ d5 = vqsub_s16(d12, d11);
+
+ d2 = vrshr_n_s16(d2, 3);  // final rounding: (x + 4) >> 3
+ d3 = vrshr_n_s16(d3, 3);
+ d4 = vrshr_n_s16(d4, 3);
+ d5 = vrshr_n_s16(d5, 3);
+
+ d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));  // transpose back
+ d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
+ d2tmp2 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]),
+ vreinterpret_s16_s32(d2tmp1.val[0]));
+ d2tmp3 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]),
+ vreinterpret_s16_s32(d2tmp1.val[1]));
+
+ q1 = vcombine_s16(d2tmp2.val[0], d2tmp2.val[1]);
+ q2 = vcombine_s16(d2tmp3.val[0], d2tmp3.val[1]);
+
+ q1 = vreinterpretq_s16_u16(  // add the pixels loaded from dst (widened to 16 bits)
+ vaddw_u8(vreinterpretq_u16_s16(q1), vreinterpret_u8_s32(d14)));
+ q2 = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(q2), vreinterpret_u8_s32(d15)));
+
+ d14 = vreinterpret_s32_u8(vqmovun_s16(q1));  // clamp to [0, 255]
+ d15 = vreinterpret_s32_u8(vqmovun_s16(q2));
+
+ dst0 = dst;
+ vst1_lane_s32((int32_t *)dst0, d14, 0);
+ dst0 += stride;
+ vst1_lane_s32((int32_t *)dst0, d14, 1);
+ dst0 += stride;
+ vst1_lane_s32((int32_t *)dst0, d15, 0);
+ dst0 += stride;
+ vst1_lane_s32((int32_t *)dst0, d15, 1);
+ return;
+}
diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/dequantizeb_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/dequantizeb_neon.c
new file mode 100644
index 0000000000..791aaea2ae
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/arm/neon/dequantizeb_neon.c
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vp8_rtcd.h"
+#include "vp8/common/blockd.h"
+// Stores d->qcoeff[i] * DQC[i] (low 16 bits) into d->dqcoeff[i] for 16 coefficients.
+void vp8_dequantize_b_neon(BLOCKD *d, short *DQC) {
+ int16x8x2_t qQ, qDQC, qDQ;
+
+ qQ = vld2q_s16(d->qcoeff);  // vld2q splits even/odd elements; both operands get the same split
+ qDQC = vld2q_s16(DQC);
+
+ qDQ.val[0] = vmulq_s16(qQ.val[0], qDQC.val[0]);
+ qDQ.val[1] = vmulq_s16(qQ.val[1], qDQC.val[1]);
+
+ vst2q_s16(d->dqcoeff, qDQ);  // vst2q re-interleaves, restoring the original element order
+}
diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/idct_blk_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/idct_blk_neon.c
new file mode 100644
index 0000000000..5c26ce67a4
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/arm/neon/idct_blk_neon.c
@@ -0,0 +1,295 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vp8_rtcd.h"
+// DC-only dequant + add for two 4x4 blocks side by side: q[0] feeds dst, q[16] feeds dst + 4.
+static void idct_dequant_0_2x_neon(int16_t *q, int16_t dq, unsigned char *dst,
+ int stride) {
+ unsigned char *dst0;
+ int i, a0, a1;
+ int16x8x2_t q2Add;
+ int32x2_t d2s32 = vdup_n_s32(0), d4s32 = vdup_n_s32(0);
+ uint8x8_t d2u8, d4u8;
+ uint16x8_t q1u16, q2u16;
+
+ a0 = ((q[0] * dq) + 4) >> 3;  // rounded (DC * dq) / 8 for the first block
+ a1 = ((q[16] * dq) + 4) >> 3;  // same for the second block
+ q[0] = q[16] = 0;  // clear the consumed coefficients
+ q2Add.val[0] = vdupq_n_s16((int16_t)a0);
+ q2Add.val[1] = vdupq_n_s16((int16_t)a1);
+
+ for (i = 0; i < 2; i++, dst += 4) {  // i selects the block; dst moves 4 pixels right
+ dst0 = dst;
+ d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 0);  // rows 0-1 in d2s32, rows 2-3 in d4s32
+ dst0 += stride;
+ d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 1);
+ dst0 += stride;
+ d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 0);
+ dst0 += stride;
+ d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 1);
+
+ q1u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]),
+ vreinterpret_u8_s32(d2s32));
+ q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]),
+ vreinterpret_u8_s32(d4s32));
+
+ d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16));  // clamp to [0, 255]
+ d4u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16));
+
+ d2s32 = vreinterpret_s32_u8(d2u8);
+ d4s32 = vreinterpret_s32_u8(d4u8);
+
+ dst0 = dst;
+ vst1_lane_s32((int32_t *)dst0, d2s32, 0);
+ dst0 += stride;
+ vst1_lane_s32((int32_t *)dst0, d2s32, 1);
+ dst0 += stride;
+ vst1_lane_s32((int32_t *)dst0, d4s32, 0);
+ dst0 += stride;
+ vst1_lane_s32((int32_t *)dst0, d4s32, 1);
+ }
+}
+// IDCT multipliers passed to vqdmulhq_n_s16 below.
+static const int16_t cospi8sqrt2minus1 = 20091;
+static const int16_t sinpi8sqrt2 = 17734;
+// sinpi8sqrt2 is 0x8a8c (35468) >> 1: its lowest bit is 0, so it can be pre-shifted without loss
+
+static void idct_dequant_full_2x_neon(int16_t *q, int16_t *dq,
+ unsigned char *dst, int stride) {
+ unsigned char *dst0, *dst1;
+ int32x2_t d28, d29, d30, d31;
+ int16x8_t q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
+ int16x8_t qEmpty = vdupq_n_s16(0);
+ int32x4x2_t q2tmp0, q2tmp1;
+ int16x8x2_t q2tmp2, q2tmp3;
+ int16x4_t dLow0, dLow1, dHigh0, dHigh1;
+ // Full dequant + 4x4 inverse transform + add for two blocks at once: q[0..15] -> dst, q[16..31] -> dst + 4.
+ d28 = d29 = d30 = d31 = vdup_n_s32(0);
+
+ // load dq
+ q0 = vld1q_s16(dq);  // the same 16 dq values are applied to both blocks
+ dq += 8;
+ q1 = vld1q_s16(dq);
+
+ // load q
+ q2 = vld1q_s16(q);
+ vst1q_s16(q, qEmpty);  // coefficients are cleared as soon as they are read
+ q += 8;
+ q3 = vld1q_s16(q);
+ vst1q_s16(q, qEmpty);
+ q += 8;
+ q4 = vld1q_s16(q);
+ vst1q_s16(q, qEmpty);
+ q += 8;
+ q5 = vld1q_s16(q);
+ vst1q_s16(q, qEmpty);
+
+ // load src from dst
+ dst0 = dst;
+ dst1 = dst + 4;
+ d28 = vld1_lane_s32((const int32_t *)dst0, d28, 0);  // lane 0: block at dst, lane 1: block at dst + 4
+ dst0 += stride;
+ d28 = vld1_lane_s32((const int32_t *)dst1, d28, 1);
+ dst1 += stride;
+ d29 = vld1_lane_s32((const int32_t *)dst0, d29, 0);
+ dst0 += stride;
+ d29 = vld1_lane_s32((const int32_t *)dst1, d29, 1);
+ dst1 += stride;
+
+ d30 = vld1_lane_s32((const int32_t *)dst0, d30, 0);
+ dst0 += stride;
+ d30 = vld1_lane_s32((const int32_t *)dst1, d30, 1);
+ dst1 += stride;
+ d31 = vld1_lane_s32((const int32_t *)dst0, d31, 0);
+ d31 = vld1_lane_s32((const int32_t *)dst1, d31, 1);
+
+ q2 = vmulq_s16(q2, q0);  // dequantize
+ q3 = vmulq_s16(q3, q1);
+ q4 = vmulq_s16(q4, q0);
+ q5 = vmulq_s16(q5, q1);
+
+ // vswp: regroup so each register holds the same four coefficients of both blocks
+ dLow0 = vget_low_s16(q2);
+ dHigh0 = vget_high_s16(q2);
+ dLow1 = vget_low_s16(q4);
+ dHigh1 = vget_high_s16(q4);
+ q2 = vcombine_s16(dLow0, dLow1);
+ q4 = vcombine_s16(dHigh0, dHigh1);
+
+ dLow0 = vget_low_s16(q3);
+ dHigh0 = vget_high_s16(q3);
+ dLow1 = vget_low_s16(q5);
+ dHigh1 = vget_high_s16(q5);
+ q3 = vcombine_s16(dLow0, dLow1);
+ q5 = vcombine_s16(dHigh0, dHigh1);
+
+ q6 = vqdmulhq_n_s16(q4, sinpi8sqrt2);
+ q7 = vqdmulhq_n_s16(q5, sinpi8sqrt2);
+ q8 = vqdmulhq_n_s16(q4, cospi8sqrt2minus1);
+ q9 = vqdmulhq_n_s16(q5, cospi8sqrt2minus1);
+
+ q10 = vqaddq_s16(q2, q3);
+ q11 = vqsubq_s16(q2, q3);
+
+ q8 = vshrq_n_s16(q8, 1);  // undo vqdmulh's doubling for the cos term
+ q9 = vshrq_n_s16(q9, 1);
+
+ q4 = vqaddq_s16(q4, q8);  // add x back, since the constant is the "minus 1" form
+ q5 = vqaddq_s16(q5, q9);
+
+ q2 = vqsubq_s16(q6, q5);
+ q3 = vqaddq_s16(q7, q4);
+
+ q4 = vqaddq_s16(q10, q3);
+ q5 = vqaddq_s16(q11, q2);
+ q6 = vqsubq_s16(q11, q2);
+ q7 = vqsubq_s16(q10, q3);
+
+ q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));  // transpose each block's 4x4 half
+ q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));
+ q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),
+ vreinterpretq_s16_s32(q2tmp1.val[0]));
+ q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),
+ vreinterpretq_s16_s32(q2tmp1.val[1]));
+
+ // loop 2
+ q8 = vqdmulhq_n_s16(q2tmp2.val[1], sinpi8sqrt2);
+ q9 = vqdmulhq_n_s16(q2tmp3.val[1], sinpi8sqrt2);
+ q10 = vqdmulhq_n_s16(q2tmp2.val[1], cospi8sqrt2minus1);
+ q11 = vqdmulhq_n_s16(q2tmp3.val[1], cospi8sqrt2minus1);
+
+ q2 = vqaddq_s16(q2tmp2.val[0], q2tmp3.val[0]);
+ q3 = vqsubq_s16(q2tmp2.val[0], q2tmp3.val[0]);
+
+ q10 = vshrq_n_s16(q10, 1);
+ q11 = vshrq_n_s16(q11, 1);
+
+ q10 = vqaddq_s16(q2tmp2.val[1], q10);
+ q11 = vqaddq_s16(q2tmp3.val[1], q11);
+
+ q8 = vqsubq_s16(q8, q11);
+ q9 = vqaddq_s16(q9, q10);
+
+ q4 = vqaddq_s16(q2, q9);
+ q5 = vqaddq_s16(q3, q8);
+ q6 = vqsubq_s16(q3, q8);
+ q7 = vqsubq_s16(q2, q9);
+
+ q4 = vrshrq_n_s16(q4, 3);  // final rounding: (x + 4) >> 3
+ q5 = vrshrq_n_s16(q5, 3);
+ q6 = vrshrq_n_s16(q6, 3);
+ q7 = vrshrq_n_s16(q7, 3);
+
+ q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));  // transpose back
+ q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));
+ q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),
+ vreinterpretq_s16_s32(q2tmp1.val[0]));
+ q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),
+ vreinterpretq_s16_s32(q2tmp1.val[1]));
+
+ q4 = vreinterpretq_s16_u16(  // add the pixels loaded from dst (widened to 16 bits)
+ vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[0]), vreinterpret_u8_s32(d28)));
+ q5 = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[1]), vreinterpret_u8_s32(d29)));
+ q6 = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[0]), vreinterpret_u8_s32(d30)));
+ q7 = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[1]), vreinterpret_u8_s32(d31)));
+
+ d28 = vreinterpret_s32_u8(vqmovun_s16(q4));  // clamp to [0, 255]
+ d29 = vreinterpret_s32_u8(vqmovun_s16(q5));
+ d30 = vreinterpret_s32_u8(vqmovun_s16(q6));
+ d31 = vreinterpret_s32_u8(vqmovun_s16(q7));
+
+ dst0 = dst;
+ dst1 = dst + 4;
+ vst1_lane_s32((int32_t *)dst0, d28, 0);
+ dst0 += stride;
+ vst1_lane_s32((int32_t *)dst1, d28, 1);
+ dst1 += stride;
+ vst1_lane_s32((int32_t *)dst0, d29, 0);
+ dst0 += stride;
+ vst1_lane_s32((int32_t *)dst1, d29, 1);
+ dst1 += stride;
+
+ vst1_lane_s32((int32_t *)dst0, d30, 0);
+ dst0 += stride;
+ vst1_lane_s32((int32_t *)dst1, d30, 1);
+ dst1 += stride;
+ vst1_lane_s32((int32_t *)dst0, d31, 0);
+ vst1_lane_s32((int32_t *)dst1, d31, 1);
+}
+// Dequant + IDCT + add over a 4x4 grid of 4x4 blocks, two blocks per helper call; pairs whose eobs are both zero are skipped.
+void vp8_dequant_idct_add_y_block_neon(short *q, short *dq, unsigned char *dst,
+ int stride, char *eobs) {
+ int i;
+
+ for (i = 0; i < 4; ++i) {  // one row of four blocks per iteration
+ if (((short *)(eobs))[0]) {  // two eob bytes read as one short; NOTE(review): assumes eobs is 2-byte aligned
+ if (((short *)eobs)[0] & 0xfefe)  // some eob in the pair is > 1: run the full transform
+ idct_dequant_full_2x_neon(q, dq, dst, stride);
+ else  // both eobs are 0 or 1: DC-only path
+ idct_dequant_0_2x_neon(q, dq[0], dst, stride);
+ }
+
+ if (((short *)(eobs))[1]) {  // second pair: coefficients 32 further on, pixels 8 to the right
+ if (((short *)eobs)[1] & 0xfefe)
+ idct_dequant_full_2x_neon(q + 32, dq, dst + 8, stride);
+ else
+ idct_dequant_0_2x_neon(q + 32, dq[0], dst + 8, stride);
+ }
+ q += 64;  // 4 blocks x 16 coefficients
+ dst += 4 * stride;
+ eobs += 4;
+ }
+}
+// Dequant + IDCT + add for four block pairs: two stacked pairs written to dst_u, then two stacked pairs written to dst_v.
+void vp8_dequant_idct_add_uv_block_neon(short *q, short *dq,
+ unsigned char *dst_u,
+ unsigned char *dst_v, int stride,
+ char *eobs) {
+ if (((short *)(eobs))[0]) {  // two eob bytes read as one short; NOTE(review): assumes eobs is 2-byte aligned
+ if (((short *)eobs)[0] & 0xfefe)  // some eob in the pair is > 1: full transform, else DC-only
+ idct_dequant_full_2x_neon(q, dq, dst_u, stride);
+ else
+ idct_dequant_0_2x_neon(q, dq[0], dst_u, stride);
+ }
+
+ q += 32;  // next pair of blocks (2 x 16 coefficients)
+ dst_u += 4 * stride;  // second dst_u pair sits 4 rows below the first
+
+ if (((short *)(eobs))[1]) {
+ if (((short *)eobs)[1] & 0xfefe)
+ idct_dequant_full_2x_neon(q, dq, dst_u, stride);
+ else
+ idct_dequant_0_2x_neon(q, dq[0], dst_u, stride);
+ }
+
+ q += 32;
+
+ if (((short *)(eobs))[2]) {
+ if (((short *)eobs)[2] & 0xfefe)
+ idct_dequant_full_2x_neon(q, dq, dst_v, stride);
+ else
+ idct_dequant_0_2x_neon(q, dq[0], dst_v, stride);
+ }
+
+ q += 32;
+ dst_v += 4 * stride;  // second dst_v pair sits 4 rows below the first
+
+ if (((short *)(eobs))[3]) {
+ if (((short *)eobs)[3] & 0xfefe)
+ idct_dequant_full_2x_neon(q, dq, dst_v, stride);
+ else
+ idct_dequant_0_2x_neon(q, dq[0], dst_v, stride);
+ }
+}
diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/iwalsh_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/iwalsh_neon.c
new file mode 100644
index 0000000000..91600bfc00
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/arm/neon/iwalsh_neon.c
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vp8_rtcd.h"
+// Inverse 4x4 Walsh transform (per the function name) of input[0..15]; the 16 results are stored 16 int16 apart in mb_dqcoeff.
+void vp8_short_inv_walsh4x4_neon(int16_t *input, int16_t *mb_dqcoeff) {
+ int16x8_t q0s16, q1s16, q2s16, q3s16;
+ int16x4_t d4s16, d5s16, d6s16, d7s16;
+ int16x4x2_t v2tmp0, v2tmp1;
+ int32x2x2_t v2tmp2, v2tmp3;
+ int16x8_t qAdd3;
+
+ q0s16 = vld1q_s16(input);
+ q1s16 = vld1q_s16(input + 8);
+
+ // 1st for loop
+ d4s16 = vadd_s16(vget_low_s16(q0s16), vget_high_s16(q1s16));  // in[0..3] + in[12..15]
+ d6s16 = vadd_s16(vget_high_s16(q0s16), vget_low_s16(q1s16));  // in[4..7] + in[8..11]
+ d5s16 = vsub_s16(vget_low_s16(q0s16), vget_high_s16(q1s16));
+ d7s16 = vsub_s16(vget_high_s16(q0s16), vget_low_s16(q1s16));
+
+ q2s16 = vcombine_s16(d4s16, d5s16);
+ q3s16 = vcombine_s16(d6s16, d7s16);
+
+ q0s16 = vaddq_s16(q2s16, q3s16);
+ q1s16 = vsubq_s16(q2s16, q3s16);
+
+ v2tmp2 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(q0s16)),  // 4x4 transpose ahead of the second pass
+ vreinterpret_s32_s16(vget_low_s16(q1s16)));
+ v2tmp3 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(q0s16)),
+ vreinterpret_s32_s16(vget_high_s16(q1s16)));
+ v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]),
+ vreinterpret_s16_s32(v2tmp3.val[0]));
+ v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]),
+ vreinterpret_s16_s32(v2tmp3.val[1]));
+
+ // 2nd for loop
+ d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]);
+ d6s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]);
+ d5s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]);
+ d7s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]);
+ q2s16 = vcombine_s16(d4s16, d5s16);
+ q3s16 = vcombine_s16(d6s16, d7s16);
+
+ qAdd3 = vdupq_n_s16(3);  // rounding bias: results are (x + 3) >> 3
+
+ q0s16 = vaddq_s16(q2s16, q3s16);
+ q1s16 = vsubq_s16(q2s16, q3s16);
+
+ q0s16 = vaddq_s16(q0s16, qAdd3);
+ q1s16 = vaddq_s16(q1s16, qAdd3);
+
+ q0s16 = vshrq_n_s16(q0s16, 3);
+ q1s16 = vshrq_n_s16(q1s16, 3);
+
+ // store
+ vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 0);  // lane 0 of each half goes to slots 0, 16, 32, 48
+ mb_dqcoeff += 16;
+ vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 0);
+ mb_dqcoeff += 16;
+ vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 0);
+ mb_dqcoeff += 16;
+ vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 0);
+ mb_dqcoeff += 16;
+
+ vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 1);  // lane 1: the next four slots
+ mb_dqcoeff += 16;
+ vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 1);
+ mb_dqcoeff += 16;
+ vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 1);
+ mb_dqcoeff += 16;
+ vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 1);
+ mb_dqcoeff += 16;
+
+ vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 2);  // lane 2
+ mb_dqcoeff += 16;
+ vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 2);
+ mb_dqcoeff += 16;
+ vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 2);
+ mb_dqcoeff += 16;
+ vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 2);
+ mb_dqcoeff += 16;
+
+ vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 3);  // lane 3; the last store lands at offset 240
+ mb_dqcoeff += 16;
+ vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 3);
+ mb_dqcoeff += 16;
+ vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 3);
+ mb_dqcoeff += 16;
+ vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 3);
+ mb_dqcoeff += 16;  // this last increment is never read
+ return;
+}
diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c
new file mode 100644
index 0000000000..df983b23a3
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vp8_rtcd.h"
+
+static INLINE void vp8_loop_filter_simple_horizontal_edge_neon(
+ unsigned char *s, int p, const unsigned char *blimit) {
+ uint8_t *sp;
+ uint8x16_t qblimit, q0u8;
+ uint8x16_t q5u8, q6u8, q7u8, q8u8, q9u8, q10u8, q14u8, q15u8;
+ int16x8_t q2s16, q3s16, q13s16;
+ int8x8_t d8s8, d9s8;
+ int8x16_t q2s8, q3s8, q4s8, q10s8, q11s8, q14s8;
+
+ qblimit = vdupq_n_u8(*blimit);
+
+ sp = s - (p << 1);
+ q5u8 = vld1q_u8(sp);
+ sp += p;
+ q6u8 = vld1q_u8(sp);
+ sp += p;
+ q7u8 = vld1q_u8(sp);
+ sp += p;
+ q8u8 = vld1q_u8(sp);
+
+ q15u8 = vabdq_u8(q6u8, q7u8);
+ q14u8 = vabdq_u8(q5u8, q8u8);
+
+ q15u8 = vqaddq_u8(q15u8, q15u8);
+ q14u8 = vshrq_n_u8(q14u8, 1);
+ q0u8 = vdupq_n_u8(0x80);
+ q13s16 = vdupq_n_s16(3);
+ q15u8 = vqaddq_u8(q15u8, q14u8);
+
+ q5u8 = veorq_u8(q5u8, q0u8);
+ q6u8 = veorq_u8(q6u8, q0u8);
+ q7u8 = veorq_u8(q7u8, q0u8);
+ q8u8 = veorq_u8(q8u8, q0u8);
+
+ q15u8 = vcgeq_u8(qblimit, q15u8);
+
+ q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7u8)),
+ vget_low_s8(vreinterpretq_s8_u8(q6u8)));
+ q3s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7u8)),
+ vget_high_s8(vreinterpretq_s8_u8(q6u8)));
+
+ q4s8 = vqsubq_s8(vreinterpretq_s8_u8(q5u8), vreinterpretq_s8_u8(q8u8));
+
+ q2s16 = vmulq_s16(q2s16, q13s16);
+ q3s16 = vmulq_s16(q3s16, q13s16);
+
+ q10u8 = vdupq_n_u8(3);
+ q9u8 = vdupq_n_u8(4);
+
+ q2s16 = vaddw_s8(q2s16, vget_low_s8(q4s8));
+ q3s16 = vaddw_s8(q3s16, vget_high_s8(q4s8));
+
+ d8s8 = vqmovn_s16(q2s16);
+ d9s8 = vqmovn_s16(q3s16);
+ q4s8 = vcombine_s8(d8s8, d9s8);
+
+ q14s8 = vandq_s8(q4s8, vreinterpretq_s8_u8(q15u8));
+
+ q2s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q10u8));
+ q3s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q9u8));
+ q2s8 = vshrq_n_s8(q2s8, 3);
+ q3s8 = vshrq_n_s8(q3s8, 3);
+
+ q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6u8), q2s8);
+ q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q7u8), q3s8);
+
+ q6u8 = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);
+ q7u8 = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);
+
+ vst1q_u8(s, q7u8);
+ s -= p;
+ vst1q_u8(s, q6u8);
+ return;
+}
+
+void vp8_loop_filter_bhs_neon(unsigned char *y_ptr, int y_stride,
+ const unsigned char *blimit) {
+ y_ptr += y_stride * 4;
+ vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
+ y_ptr += y_stride * 4;
+ vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
+ y_ptr += y_stride * 4;
+ vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
+ return;
+}
+
+void vp8_loop_filter_mbhs_neon(unsigned char *y_ptr, int y_stride,
+ const unsigned char *blimit) {
+ vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
+ return;
+}
diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c
new file mode 100644
index 0000000000..fbc83ae290
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c
@@ -0,0 +1,274 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vp8_rtcd.h"
+#include "vpx_ports/arm.h"
+
+#ifdef VPX_INCOMPATIBLE_GCC
+static INLINE void write_2x4(unsigned char *dst, int pitch,
+ const uint8x8x2_t result) {
+ /*
+ * uint8x8x2_t result
+ 00 01 02 03 | 04 05 06 07
+ 10 11 12 13 | 14 15 16 17
+ ---
+ * after vtrn_u8
+ 00 10 02 12 | 04 14 06 16
+ 01 11 03 13 | 05 15 07 17
+ */
+ const uint8x8x2_t r01_u8 = vtrn_u8(result.val[0], result.val[1]);
+ const uint16x4_t x_0_4 = vreinterpret_u16_u8(r01_u8.val[0]);
+ const uint16x4_t x_1_5 = vreinterpret_u16_u8(r01_u8.val[1]);
+ vst1_lane_u16((uint16_t *)dst, x_0_4, 0);
+ dst += pitch;
+ vst1_lane_u16((uint16_t *)dst, x_1_5, 0);
+ dst += pitch;
+ vst1_lane_u16((uint16_t *)dst, x_0_4, 1);
+ dst += pitch;
+ vst1_lane_u16((uint16_t *)dst, x_1_5, 1);
+ dst += pitch;
+ vst1_lane_u16((uint16_t *)dst, x_0_4, 2);
+ dst += pitch;
+ vst1_lane_u16((uint16_t *)dst, x_1_5, 2);
+ dst += pitch;
+ vst1_lane_u16((uint16_t *)dst, x_0_4, 3);
+ dst += pitch;
+ vst1_lane_u16((uint16_t *)dst, x_1_5, 3);
+}
+
+static INLINE void write_2x8(unsigned char *dst, int pitch,
+ const uint8x8x2_t result,
+ const uint8x8x2_t result2) {
+ write_2x4(dst, pitch, result);
+ dst += pitch * 8;
+ write_2x4(dst, pitch, result2);
+}
+#else
+static INLINE void write_2x8(unsigned char *dst, int pitch,
+ const uint8x8x2_t result,
+ const uint8x8x2_t result2) {
+ vst2_lane_u8(dst, result, 0);
+ dst += pitch;
+ vst2_lane_u8(dst, result, 1);
+ dst += pitch;
+ vst2_lane_u8(dst, result, 2);
+ dst += pitch;
+ vst2_lane_u8(dst, result, 3);
+ dst += pitch;
+ vst2_lane_u8(dst, result, 4);
+ dst += pitch;
+ vst2_lane_u8(dst, result, 5);
+ dst += pitch;
+ vst2_lane_u8(dst, result, 6);
+ dst += pitch;
+ vst2_lane_u8(dst, result, 7);
+ dst += pitch;
+
+ vst2_lane_u8(dst, result2, 0);
+ dst += pitch;
+ vst2_lane_u8(dst, result2, 1);
+ dst += pitch;
+ vst2_lane_u8(dst, result2, 2);
+ dst += pitch;
+ vst2_lane_u8(dst, result2, 3);
+ dst += pitch;
+ vst2_lane_u8(dst, result2, 4);
+ dst += pitch;
+ vst2_lane_u8(dst, result2, 5);
+ dst += pitch;
+ vst2_lane_u8(dst, result2, 6);
+ dst += pitch;
+ vst2_lane_u8(dst, result2, 7);
+}
+#endif // VPX_INCOMPATIBLE_GCC
+
+#ifdef VPX_INCOMPATIBLE_GCC
+static INLINE uint8x8x4_t read_4x8(unsigned char *src, int pitch) {
+ uint8x8x4_t x;
+ const uint8x8_t a = vld1_u8(src);
+ const uint8x8_t b = vld1_u8(src + pitch * 1);
+ const uint8x8_t c = vld1_u8(src + pitch * 2);
+ const uint8x8_t d = vld1_u8(src + pitch * 3);
+ const uint8x8_t e = vld1_u8(src + pitch * 4);
+ const uint8x8_t f = vld1_u8(src + pitch * 5);
+ const uint8x8_t g = vld1_u8(src + pitch * 6);
+ const uint8x8_t h = vld1_u8(src + pitch * 7);
+ const uint32x2x2_t r04_u32 =
+ vtrn_u32(vreinterpret_u32_u8(a), vreinterpret_u32_u8(e));
+ const uint32x2x2_t r15_u32 =
+ vtrn_u32(vreinterpret_u32_u8(b), vreinterpret_u32_u8(f));
+ const uint32x2x2_t r26_u32 =
+ vtrn_u32(vreinterpret_u32_u8(c), vreinterpret_u32_u8(g));
+ const uint32x2x2_t r37_u32 =
+ vtrn_u32(vreinterpret_u32_u8(d), vreinterpret_u32_u8(h));
+ const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u32(r04_u32.val[0]),
+ vreinterpret_u16_u32(r26_u32.val[0]));
+ const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u32(r15_u32.val[0]),
+ vreinterpret_u16_u32(r37_u32.val[0]));
+ const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]),
+ vreinterpret_u8_u16(r13_u16.val[0]));
+ const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]),
+ vreinterpret_u8_u16(r13_u16.val[1]));
+ /*
+ * after vtrn_u32
+ 00 01 02 03 | 40 41 42 43
+ 10 11 12 13 | 50 51 52 53
+ 20 21 22 23 | 60 61 62 63
+ 30 31 32 33 | 70 71 72 73
+ ---
+ * after vtrn_u16
+ 00 01 20 21 | 40 41 60 61
+ 02 03 22 23 | 42 43 62 63
+ 10 11 30 31 | 50 51 70 71
+ 12 13 32 33 | 52 53 72 73
+
+ 00 01 20 21 | 40 41 60 61
+ 10 11 30 31 | 50 51 70 71
+ 02 03 22 23 | 42 43 62 63
+ 12 13 32 33 | 52 53 72 73
+ ---
+ * after vtrn_u8
+ 00 10 20 30 | 40 50 60 70
+ 01 11 21 31 | 41 51 61 71
+ 02 12 22 32 | 42 52 62 72
+ 03 13 23 33 | 43 53 63 73
+ */
+ x.val[0] = r01_u8.val[0];
+ x.val[1] = r01_u8.val[1];
+ x.val[2] = r23_u8.val[0];
+ x.val[3] = r23_u8.val[1];
+
+ return x;
+}
+#else
+static INLINE uint8x8x4_t read_4x8(unsigned char *src, int pitch) {
+ uint8x8x4_t x;
+ x.val[0] = x.val[1] = x.val[2] = x.val[3] = vdup_n_u8(0);
+ x = vld4_lane_u8(src, x, 0);
+ src += pitch;
+ x = vld4_lane_u8(src, x, 1);
+ src += pitch;
+ x = vld4_lane_u8(src, x, 2);
+ src += pitch;
+ x = vld4_lane_u8(src, x, 3);
+ src += pitch;
+ x = vld4_lane_u8(src, x, 4);
+ src += pitch;
+ x = vld4_lane_u8(src, x, 5);
+ src += pitch;
+ x = vld4_lane_u8(src, x, 6);
+ src += pitch;
+ x = vld4_lane_u8(src, x, 7);
+ return x;
+}
+#endif // VPX_INCOMPATIBLE_GCC
+
+static INLINE void vp8_loop_filter_simple_vertical_edge_neon(
+ unsigned char *s, int p, const unsigned char *blimit) {
+ unsigned char *src1;
+ uint8x16_t qblimit, q0u8;
+ uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q11u8, q12u8, q14u8, q15u8;
+ int16x8_t q2s16, q13s16, q11s16;
+ int8x8_t d28s8, d29s8;
+ int8x16_t q2s8, q3s8, q10s8, q11s8, q14s8;
+ uint8x8x4_t d0u8x4; // d6, d7, d8, d9
+ uint8x8x4_t d1u8x4; // d10, d11, d12, d13
+ uint8x8x2_t d2u8x2; // d12, d13
+ uint8x8x2_t d3u8x2; // d14, d15
+
+ qblimit = vdupq_n_u8(*blimit);
+
+ src1 = s - 2;
+ d0u8x4 = read_4x8(src1, p);
+ src1 += p * 8;
+ d1u8x4 = read_4x8(src1, p);
+
+ q3u8 = vcombine_u8(d0u8x4.val[0], d1u8x4.val[0]); // d6 d10
+ q4u8 = vcombine_u8(d0u8x4.val[2], d1u8x4.val[2]); // d8 d12
+ q5u8 = vcombine_u8(d0u8x4.val[1], d1u8x4.val[1]); // d7 d11
+ q6u8 = vcombine_u8(d0u8x4.val[3], d1u8x4.val[3]); // d9 d13
+
+ q15u8 = vabdq_u8(q5u8, q4u8);
+ q14u8 = vabdq_u8(q3u8, q6u8);
+
+ q15u8 = vqaddq_u8(q15u8, q15u8);
+ q14u8 = vshrq_n_u8(q14u8, 1);
+ q0u8 = vdupq_n_u8(0x80);
+ q11s16 = vdupq_n_s16(3);
+ q15u8 = vqaddq_u8(q15u8, q14u8);
+
+ q3u8 = veorq_u8(q3u8, q0u8);
+ q4u8 = veorq_u8(q4u8, q0u8);
+ q5u8 = veorq_u8(q5u8, q0u8);
+ q6u8 = veorq_u8(q6u8, q0u8);
+
+ q15u8 = vcgeq_u8(qblimit, q15u8);
+
+ q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q4u8)),
+ vget_low_s8(vreinterpretq_s8_u8(q5u8)));
+ q13s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q4u8)),
+ vget_high_s8(vreinterpretq_s8_u8(q5u8)));
+
+ q14s8 = vqsubq_s8(vreinterpretq_s8_u8(q3u8), vreinterpretq_s8_u8(q6u8));
+
+ q2s16 = vmulq_s16(q2s16, q11s16);
+ q13s16 = vmulq_s16(q13s16, q11s16);
+
+ q11u8 = vdupq_n_u8(3);
+ q12u8 = vdupq_n_u8(4);
+
+ q2s16 = vaddw_s8(q2s16, vget_low_s8(q14s8));
+ q13s16 = vaddw_s8(q13s16, vget_high_s8(q14s8));
+
+ d28s8 = vqmovn_s16(q2s16);
+ d29s8 = vqmovn_s16(q13s16);
+ q14s8 = vcombine_s8(d28s8, d29s8);
+
+ q14s8 = vandq_s8(q14s8, vreinterpretq_s8_u8(q15u8));
+
+ q2s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q11u8));
+ q3s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q12u8));
+ q2s8 = vshrq_n_s8(q2s8, 3);
+ q14s8 = vshrq_n_s8(q3s8, 3);
+
+ q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q5u8), q2s8);
+ q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q4u8), q14s8);
+
+ q6u8 = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);
+ q7u8 = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);
+
+ d2u8x2.val[0] = vget_low_u8(q6u8); // d12
+ d2u8x2.val[1] = vget_low_u8(q7u8); // d14
+ d3u8x2.val[0] = vget_high_u8(q6u8); // d13
+ d3u8x2.val[1] = vget_high_u8(q7u8); // d15
+
+ src1 = s - 1;
+ write_2x8(src1, p, d2u8x2, d3u8x2);
+}
+
+void vp8_loop_filter_bvs_neon(unsigned char *y_ptr, int y_stride,
+ const unsigned char *blimit) {
+ y_ptr += 4;
+ vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
+ y_ptr += 4;
+ vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
+ y_ptr += 4;
+ vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
+ return;
+}
+
+void vp8_loop_filter_mbvs_neon(unsigned char *y_ptr, int y_stride,
+ const unsigned char *blimit) {
+ vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
+ return;
+}
diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c
new file mode 100644
index 0000000000..fafaf2d451
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c
@@ -0,0 +1,613 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "vp8/common/arm/loopfilter_arm.h"
+
+static INLINE void vp8_mbloop_filter_neon(uint8x16_t qblimit, // mblimit
+ uint8x16_t qlimit, // limit
+ uint8x16_t qthresh, // thresh
+ uint8x16_t q3, // p3
+ uint8x16_t q4, // p2
+ uint8x16_t q5, // p1
+ uint8x16_t q6, // p0
+ uint8x16_t q7, // q0
+ uint8x16_t q8, // q1
+ uint8x16_t q9, // q2
+ uint8x16_t q10, // q3
+ uint8x16_t *q4r, // p2
+ uint8x16_t *q5r, // p1
+ uint8x16_t *q6r, // p0
+ uint8x16_t *q7r, // q0
+ uint8x16_t *q8r, // q1
+ uint8x16_t *q9r) { // q2
+ uint8x16_t q0u8, q1u8, q11u8, q12u8, q13u8, q14u8, q15u8;
+ int16x8_t q0s16, q2s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+ int8x16_t q1s8, q6s8, q7s8, q2s8, q11s8, q13s8;
+ uint16x8_t q0u16, q11u16, q12u16, q13u16, q14u16, q15u16;
+ int8x16_t q0s8, q12s8, q14s8, q15s8;
+ int8x8_t d0, d1, d2, d3, d4, d5, d24, d25, d28, d29;
+
+ q11u8 = vabdq_u8(q3, q4);
+ q12u8 = vabdq_u8(q4, q5);
+ q13u8 = vabdq_u8(q5, q6);
+ q14u8 = vabdq_u8(q8, q7);
+ q1u8 = vabdq_u8(q9, q8);
+ q0u8 = vabdq_u8(q10, q9);
+
+ q11u8 = vmaxq_u8(q11u8, q12u8);
+ q12u8 = vmaxq_u8(q13u8, q14u8);
+ q1u8 = vmaxq_u8(q1u8, q0u8);
+ q15u8 = vmaxq_u8(q11u8, q12u8);
+
+ q12u8 = vabdq_u8(q6, q7);
+
+ // vp8_hevmask
+ q13u8 = vcgtq_u8(q13u8, qthresh);
+ q14u8 = vcgtq_u8(q14u8, qthresh);
+ q15u8 = vmaxq_u8(q15u8, q1u8);
+
+ q15u8 = vcgeq_u8(qlimit, q15u8);
+
+ q1u8 = vabdq_u8(q5, q8);
+ q12u8 = vqaddq_u8(q12u8, q12u8);
+
+ // vp8_filter() function
+ // convert to signed
+ q0u8 = vdupq_n_u8(0x80);
+ q9 = veorq_u8(q9, q0u8);
+ q8 = veorq_u8(q8, q0u8);
+ q7 = veorq_u8(q7, q0u8);
+ q6 = veorq_u8(q6, q0u8);
+ q5 = veorq_u8(q5, q0u8);
+ q4 = veorq_u8(q4, q0u8);
+
+ q1u8 = vshrq_n_u8(q1u8, 1);
+ q12u8 = vqaddq_u8(q12u8, q1u8);
+
+ q14u8 = vorrq_u8(q13u8, q14u8);
+ q12u8 = vcgeq_u8(qblimit, q12u8);
+
+ q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
+ vget_low_s8(vreinterpretq_s8_u8(q6)));
+ q13s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
+ vget_high_s8(vreinterpretq_s8_u8(q6)));
+
+ q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5), vreinterpretq_s8_u8(q8));
+
+ q11s16 = vdupq_n_s16(3);
+ q2s16 = vmulq_s16(q2s16, q11s16);
+ q13s16 = vmulq_s16(q13s16, q11s16);
+
+ q15u8 = vandq_u8(q15u8, q12u8);
+
+ q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));
+ q13s16 = vaddw_s8(q13s16, vget_high_s8(q1s8));
+
+ q12u8 = vdupq_n_u8(3);
+ q11u8 = vdupq_n_u8(4);
+ // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
+ d2 = vqmovn_s16(q2s16);
+ d3 = vqmovn_s16(q13s16);
+ q1s8 = vcombine_s8(d2, d3);
+ q1s8 = vandq_s8(q1s8, vreinterpretq_s8_u8(q15u8));
+ q13s8 = vandq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
+
+ q2s8 = vqaddq_s8(q13s8, vreinterpretq_s8_u8(q11u8));
+ q13s8 = vqaddq_s8(q13s8, vreinterpretq_s8_u8(q12u8));
+ q2s8 = vshrq_n_s8(q2s8, 3);
+ q13s8 = vshrq_n_s8(q13s8, 3);
+
+ q7s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q2s8);
+ q6s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q13s8);
+
+ q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
+
+ q0u16 = q11u16 = q12u16 = q13u16 = q14u16 = q15u16 = vdupq_n_u16(63);
+ d5 = vdup_n_s8(9);
+ d4 = vdup_n_s8(18);
+
+ q0s16 = vmlal_s8(vreinterpretq_s16_u16(q0u16), vget_low_s8(q1s8), d5);
+ q11s16 = vmlal_s8(vreinterpretq_s16_u16(q11u16), vget_high_s8(q1s8), d5);
+ d5 = vdup_n_s8(27);
+ q12s16 = vmlal_s8(vreinterpretq_s16_u16(q12u16), vget_low_s8(q1s8), d4);
+ q13s16 = vmlal_s8(vreinterpretq_s16_u16(q13u16), vget_high_s8(q1s8), d4);
+ q14s16 = vmlal_s8(vreinterpretq_s16_u16(q14u16), vget_low_s8(q1s8), d5);
+ q15s16 = vmlal_s8(vreinterpretq_s16_u16(q15u16), vget_high_s8(q1s8), d5);
+
+ d0 = vqshrn_n_s16(q0s16, 7);
+ d1 = vqshrn_n_s16(q11s16, 7);
+ d24 = vqshrn_n_s16(q12s16, 7);
+ d25 = vqshrn_n_s16(q13s16, 7);
+ d28 = vqshrn_n_s16(q14s16, 7);
+ d29 = vqshrn_n_s16(q15s16, 7);
+
+ q0s8 = vcombine_s8(d0, d1);
+ q12s8 = vcombine_s8(d24, d25);
+ q14s8 = vcombine_s8(d28, d29);
+
+ q11s8 = vqsubq_s8(vreinterpretq_s8_u8(q9), q0s8);
+ q0s8 = vqaddq_s8(vreinterpretq_s8_u8(q4), q0s8);
+ q13s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q12s8);
+ q12s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q12s8);
+ q15s8 = vqsubq_s8((q7s8), q14s8);
+ q14s8 = vqaddq_s8((q6s8), q14s8);
+
+ q1u8 = vdupq_n_u8(0x80);
+ *q9r = veorq_u8(vreinterpretq_u8_s8(q11s8), q1u8);
+ *q8r = veorq_u8(vreinterpretq_u8_s8(q13s8), q1u8);
+ *q7r = veorq_u8(vreinterpretq_u8_s8(q15s8), q1u8);
+ *q6r = veorq_u8(vreinterpretq_u8_s8(q14s8), q1u8);
+ *q5r = veorq_u8(vreinterpretq_u8_s8(q12s8), q1u8);
+ *q4r = veorq_u8(vreinterpretq_u8_s8(q0s8), q1u8);
+ return;
+}
+
+void vp8_mbloop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch,
+ unsigned char blimit,
+ unsigned char limit,
+ unsigned char thresh) {
+ uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+ uint8x16_t q5, q6, q7, q8, q9, q10;
+
+ qblimit = vdupq_n_u8(blimit);
+ qlimit = vdupq_n_u8(limit);
+ qthresh = vdupq_n_u8(thresh);
+
+ src -= (pitch << 2);
+
+ q3 = vld1q_u8(src);
+ src += pitch;
+ q4 = vld1q_u8(src);
+ src += pitch;
+ q5 = vld1q_u8(src);
+ src += pitch;
+ q6 = vld1q_u8(src);
+ src += pitch;
+ q7 = vld1q_u8(src);
+ src += pitch;
+ q8 = vld1q_u8(src);
+ src += pitch;
+ q9 = vld1q_u8(src);
+ src += pitch;
+ q10 = vld1q_u8(src);
+
+ vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4, q5, q6, q7, q8, q9,
+ q10, &q4, &q5, &q6, &q7, &q8, &q9);
+
+ src -= (pitch * 6);
+ vst1q_u8(src, q4);
+ src += pitch;
+ vst1q_u8(src, q5);
+ src += pitch;
+ vst1q_u8(src, q6);
+ src += pitch;
+ vst1q_u8(src, q7);
+ src += pitch;
+ vst1q_u8(src, q8);
+ src += pitch;
+ vst1q_u8(src, q9);
+ return;
+}
+
+void vp8_mbloop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch,
+ unsigned char blimit,
+ unsigned char limit,
+ unsigned char thresh,
+ unsigned char *v) {
+ uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+ uint8x16_t q5, q6, q7, q8, q9, q10;
+ uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
+ uint8x8_t d15, d16, d17, d18, d19, d20, d21;
+
+ qblimit = vdupq_n_u8(blimit);
+ qlimit = vdupq_n_u8(limit);
+ qthresh = vdupq_n_u8(thresh);
+
+ u -= (pitch << 2);
+ v -= (pitch << 2);
+
+ d6 = vld1_u8(u);
+ u += pitch;
+ d7 = vld1_u8(v);
+ v += pitch;
+ d8 = vld1_u8(u);
+ u += pitch;
+ d9 = vld1_u8(v);
+ v += pitch;
+ d10 = vld1_u8(u);
+ u += pitch;
+ d11 = vld1_u8(v);
+ v += pitch;
+ d12 = vld1_u8(u);
+ u += pitch;
+ d13 = vld1_u8(v);
+ v += pitch;
+ d14 = vld1_u8(u);
+ u += pitch;
+ d15 = vld1_u8(v);
+ v += pitch;
+ d16 = vld1_u8(u);
+ u += pitch;
+ d17 = vld1_u8(v);
+ v += pitch;
+ d18 = vld1_u8(u);
+ u += pitch;
+ d19 = vld1_u8(v);
+ v += pitch;
+ d20 = vld1_u8(u);
+ d21 = vld1_u8(v);
+
+ q3 = vcombine_u8(d6, d7);
+ q4 = vcombine_u8(d8, d9);
+ q5 = vcombine_u8(d10, d11);
+ q6 = vcombine_u8(d12, d13);
+ q7 = vcombine_u8(d14, d15);
+ q8 = vcombine_u8(d16, d17);
+ q9 = vcombine_u8(d18, d19);
+ q10 = vcombine_u8(d20, d21);
+
+ vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4, q5, q6, q7, q8, q9,
+ q10, &q4, &q5, &q6, &q7, &q8, &q9);
+
+ u -= (pitch * 6);
+ v -= (pitch * 6);
+ vst1_u8(u, vget_low_u8(q4));
+ u += pitch;
+ vst1_u8(v, vget_high_u8(q4));
+ v += pitch;
+ vst1_u8(u, vget_low_u8(q5));
+ u += pitch;
+ vst1_u8(v, vget_high_u8(q5));
+ v += pitch;
+ vst1_u8(u, vget_low_u8(q6));
+ u += pitch;
+ vst1_u8(v, vget_high_u8(q6));
+ v += pitch;
+ vst1_u8(u, vget_low_u8(q7));
+ u += pitch;
+ vst1_u8(v, vget_high_u8(q7));
+ v += pitch;
+ vst1_u8(u, vget_low_u8(q8));
+ u += pitch;
+ vst1_u8(v, vget_high_u8(q8));
+ v += pitch;
+ vst1_u8(u, vget_low_u8(q9));
+ vst1_u8(v, vget_high_u8(q9));
+ return;
+}
+
+void vp8_mbloop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
+ unsigned char blimit,
+ unsigned char limit,
+ unsigned char thresh) {
+ unsigned char *s1, *s2;
+ uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+ uint8x16_t q5, q6, q7, q8, q9, q10;
+ uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
+ uint8x8_t d15, d16, d17, d18, d19, d20, d21;
+ uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
+ uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
+ uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
+
+ qblimit = vdupq_n_u8(blimit);
+ qlimit = vdupq_n_u8(limit);
+ qthresh = vdupq_n_u8(thresh);
+
+ s1 = src - 4;
+ s2 = s1 + 8 * pitch;
+ d6 = vld1_u8(s1);
+ s1 += pitch;
+ d7 = vld1_u8(s2);
+ s2 += pitch;
+ d8 = vld1_u8(s1);
+ s1 += pitch;
+ d9 = vld1_u8(s2);
+ s2 += pitch;
+ d10 = vld1_u8(s1);
+ s1 += pitch;
+ d11 = vld1_u8(s2);
+ s2 += pitch;
+ d12 = vld1_u8(s1);
+ s1 += pitch;
+ d13 = vld1_u8(s2);
+ s2 += pitch;
+ d14 = vld1_u8(s1);
+ s1 += pitch;
+ d15 = vld1_u8(s2);
+ s2 += pitch;
+ d16 = vld1_u8(s1);
+ s1 += pitch;
+ d17 = vld1_u8(s2);
+ s2 += pitch;
+ d18 = vld1_u8(s1);
+ s1 += pitch;
+ d19 = vld1_u8(s2);
+ s2 += pitch;
+ d20 = vld1_u8(s1);
+ d21 = vld1_u8(s2);
+
+ q3 = vcombine_u8(d6, d7);
+ q4 = vcombine_u8(d8, d9);
+ q5 = vcombine_u8(d10, d11);
+ q6 = vcombine_u8(d12, d13);
+ q7 = vcombine_u8(d14, d15);
+ q8 = vcombine_u8(d16, d17);
+ q9 = vcombine_u8(d18, d19);
+ q10 = vcombine_u8(d20, d21);
+
+ q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
+ q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
+ q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
+ q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
+
+ q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
+ vreinterpretq_u16_u32(q2tmp2.val[0]));
+ q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
+ vreinterpretq_u16_u32(q2tmp3.val[0]));
+ q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
+ vreinterpretq_u16_u32(q2tmp2.val[1]));
+ q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
+ vreinterpretq_u16_u32(q2tmp3.val[1]));
+
+ q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
+ vreinterpretq_u8_u16(q2tmp5.val[0]));
+ q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
+ vreinterpretq_u8_u16(q2tmp5.val[1]));
+ q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
+ vreinterpretq_u8_u16(q2tmp7.val[0]));
+ q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
+ vreinterpretq_u8_u16(q2tmp7.val[1]));
+
+ q3 = q2tmp8.val[0];
+ q4 = q2tmp8.val[1];
+ q5 = q2tmp9.val[0];
+ q6 = q2tmp9.val[1];
+ q7 = q2tmp10.val[0];
+ q8 = q2tmp10.val[1];
+ q9 = q2tmp11.val[0];
+ q10 = q2tmp11.val[1];
+
+ vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4, q5, q6, q7, q8, q9,
+ q10, &q4, &q5, &q6, &q7, &q8, &q9);
+
+ q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
+ q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
+ q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
+ q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
+
+ q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
+ vreinterpretq_u16_u32(q2tmp2.val[0]));
+ q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
+ vreinterpretq_u16_u32(q2tmp3.val[0]));
+ q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
+ vreinterpretq_u16_u32(q2tmp2.val[1]));
+ q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
+ vreinterpretq_u16_u32(q2tmp3.val[1]));
+
+ q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
+ vreinterpretq_u8_u16(q2tmp5.val[0]));
+ q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
+ vreinterpretq_u8_u16(q2tmp5.val[1]));
+ q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
+ vreinterpretq_u8_u16(q2tmp7.val[0]));
+ q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
+ vreinterpretq_u8_u16(q2tmp7.val[1]));
+
+ q3 = q2tmp8.val[0];
+ q4 = q2tmp8.val[1];
+ q5 = q2tmp9.val[0];
+ q6 = q2tmp9.val[1];
+ q7 = q2tmp10.val[0];
+ q8 = q2tmp10.val[1];
+ q9 = q2tmp11.val[0];
+ q10 = q2tmp11.val[1];
+
+ s1 -= 7 * pitch;
+ s2 -= 7 * pitch;
+
+ vst1_u8(s1, vget_low_u8(q3));
+ s1 += pitch;
+ vst1_u8(s2, vget_high_u8(q3));
+ s2 += pitch;
+ vst1_u8(s1, vget_low_u8(q4));
+ s1 += pitch;
+ vst1_u8(s2, vget_high_u8(q4));
+ s2 += pitch;
+ vst1_u8(s1, vget_low_u8(q5));
+ s1 += pitch;
+ vst1_u8(s2, vget_high_u8(q5));
+ s2 += pitch;
+ vst1_u8(s1, vget_low_u8(q6));
+ s1 += pitch;
+ vst1_u8(s2, vget_high_u8(q6));
+ s2 += pitch;
+ vst1_u8(s1, vget_low_u8(q7));
+ s1 += pitch;
+ vst1_u8(s2, vget_high_u8(q7));
+ s2 += pitch;
+ vst1_u8(s1, vget_low_u8(q8));
+ s1 += pitch;
+ vst1_u8(s2, vget_high_u8(q8));
+ s2 += pitch;
+ vst1_u8(s1, vget_low_u8(q9));
+ s1 += pitch;
+ vst1_u8(s2, vget_high_u8(q9));
+ s2 += pitch;
+ vst1_u8(s1, vget_low_u8(q10));
+ vst1_u8(s2, vget_high_u8(q10));
+ return;
+}
+
+void vp8_mbloop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch,
+ unsigned char blimit,
+ unsigned char limit,
+ unsigned char thresh,
+ unsigned char *v) {
+ unsigned char *us, *ud;
+ unsigned char *vs, *vd;
+ uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+ uint8x16_t q5, q6, q7, q8, q9, q10;
+ uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
+ uint8x8_t d15, d16, d17, d18, d19, d20, d21;
+ uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
+ uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
+ uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
+
+ qblimit = vdupq_n_u8(blimit);
+ qlimit = vdupq_n_u8(limit);
+ qthresh = vdupq_n_u8(thresh);
+
+ us = u - 4;
+ vs = v - 4;
+ d6 = vld1_u8(us);
+ us += pitch;
+ d7 = vld1_u8(vs);
+ vs += pitch;
+ d8 = vld1_u8(us);
+ us += pitch;
+ d9 = vld1_u8(vs);
+ vs += pitch;
+ d10 = vld1_u8(us);
+ us += pitch;
+ d11 = vld1_u8(vs);
+ vs += pitch;
+ d12 = vld1_u8(us);
+ us += pitch;
+ d13 = vld1_u8(vs);
+ vs += pitch;
+ d14 = vld1_u8(us);
+ us += pitch;
+ d15 = vld1_u8(vs);
+ vs += pitch;
+ d16 = vld1_u8(us);
+ us += pitch;
+ d17 = vld1_u8(vs);
+ vs += pitch;
+ d18 = vld1_u8(us);
+ us += pitch;
+ d19 = vld1_u8(vs);
+ vs += pitch;
+ d20 = vld1_u8(us);
+ d21 = vld1_u8(vs);
+
+ q3 = vcombine_u8(d6, d7);
+ q4 = vcombine_u8(d8, d9);
+ q5 = vcombine_u8(d10, d11);
+ q6 = vcombine_u8(d12, d13);
+ q7 = vcombine_u8(d14, d15);
+ q8 = vcombine_u8(d16, d17);
+ q9 = vcombine_u8(d18, d19);
+ q10 = vcombine_u8(d20, d21);
+
+ q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
+ q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
+ q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
+ q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
+
+ q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
+ vreinterpretq_u16_u32(q2tmp2.val[0]));
+ q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
+ vreinterpretq_u16_u32(q2tmp3.val[0]));
+ q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
+ vreinterpretq_u16_u32(q2tmp2.val[1]));
+ q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
+ vreinterpretq_u16_u32(q2tmp3.val[1]));
+
+ q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
+ vreinterpretq_u8_u16(q2tmp5.val[0]));
+ q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
+ vreinterpretq_u8_u16(q2tmp5.val[1]));
+ q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
+ vreinterpretq_u8_u16(q2tmp7.val[0]));
+ q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
+ vreinterpretq_u8_u16(q2tmp7.val[1]));
+
+ q3 = q2tmp8.val[0];
+ q4 = q2tmp8.val[1];
+ q5 = q2tmp9.val[0];
+ q6 = q2tmp9.val[1];
+ q7 = q2tmp10.val[0];
+ q8 = q2tmp10.val[1];
+ q9 = q2tmp11.val[0];
+ q10 = q2tmp11.val[1];
+
+ vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4, q5, q6, q7, q8, q9,
+ q10, &q4, &q5, &q6, &q7, &q8, &q9);
+
+ q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
+ q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
+ q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
+ q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
+
+ q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
+ vreinterpretq_u16_u32(q2tmp2.val[0]));
+ q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
+ vreinterpretq_u16_u32(q2tmp3.val[0]));
+ q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
+ vreinterpretq_u16_u32(q2tmp2.val[1]));
+ q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
+ vreinterpretq_u16_u32(q2tmp3.val[1]));
+
+ q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
+ vreinterpretq_u8_u16(q2tmp5.val[0]));
+ q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
+ vreinterpretq_u8_u16(q2tmp5.val[1]));
+ q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
+ vreinterpretq_u8_u16(q2tmp7.val[0]));
+ q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
+ vreinterpretq_u8_u16(q2tmp7.val[1]));
+
+ q3 = q2tmp8.val[0];
+ q4 = q2tmp8.val[1];
+ q5 = q2tmp9.val[0];
+ q6 = q2tmp9.val[1];
+ q7 = q2tmp10.val[0];
+ q8 = q2tmp10.val[1];
+ q9 = q2tmp11.val[0];
+ q10 = q2tmp11.val[1];
+
+ ud = u - 4;
+ vst1_u8(ud, vget_low_u8(q3));
+ ud += pitch;
+ vst1_u8(ud, vget_low_u8(q4));
+ ud += pitch;
+ vst1_u8(ud, vget_low_u8(q5));
+ ud += pitch;
+ vst1_u8(ud, vget_low_u8(q6));
+ ud += pitch;
+ vst1_u8(ud, vget_low_u8(q7));
+ ud += pitch;
+ vst1_u8(ud, vget_low_u8(q8));
+ ud += pitch;
+ vst1_u8(ud, vget_low_u8(q9));
+ ud += pitch;
+ vst1_u8(ud, vget_low_u8(q10));
+
+ vd = v - 4;
+ vst1_u8(vd, vget_high_u8(q3));
+ vd += pitch;
+ vst1_u8(vd, vget_high_u8(q4));
+ vd += pitch;
+ vst1_u8(vd, vget_high_u8(q5));
+ vd += pitch;
+ vst1_u8(vd, vget_high_u8(q6));
+ vd += pitch;
+ vst1_u8(vd, vget_high_u8(q7));
+ vd += pitch;
+ vst1_u8(vd, vget_high_u8(q8));
+ vd += pitch;
+ vst1_u8(vd, vget_high_u8(q9));
+ vd += pitch;
+ vst1_u8(vd, vget_high_u8(q10));
+ return;
+}
diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c
new file mode 100644
index 0000000000..2724ca236b
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vp8_rtcd.h"
+ /* Fixed-point multipliers for the 4x4 inverse transform in this file; both
+  * are passed to vqdmulhq_n_s16.
+  * NOTE(review): going by the names, 20091 and 35468 are presumably
+  * sqrt(2) * cos(pi / 8) - 1 and sqrt(2) * sin(pi / 8) scaled by 2^16 --
+  * confirm against the C reference transform. */
+ static const int16_t cospi8sqrt2minus1 = 20091;
+ // 35468 exceeds INT16_MAX and would be converted to a negative number if it
+ // were stored in an int16_t. Because vqdmulh doubles the product it computes,
+ // the constant can be halved beforehand instead. This saves compensating for
+ // the negative value as well as shifting the result.
+ static const int16_t sinpi8sqrt2 = 35468 >> 1;
+ /* Inverse 4x4 transform of the 16 coefficients at `input`; the result is
+  * added to a 4x4 prediction block and stored as 4x4 bytes.
+  *
+  * input       - 16 int16_t coefficients, read as four rows of four.
+  * pred_ptr    - prediction pixels; four bytes are read from each of four rows.
+  * pred_stride - distance in bytes between prediction rows.
+  * dst_ptr     - output pixels; four bytes are written to each of four rows.
+  * dst_stride  - distance in bytes between output rows.
+  *
+  * Both passes run the same butterfly element-wise on whole rows and are each
+  * followed by a 4x4 transpose; only the second pass applies the rounding
+  * shift by 3.
+  * NOTE(review): the prediction loads and the stores go through uint32_t
+  * pointer casts, which presumably relies on every row being 4-byte aligned
+  * (or on the target tolerating unaligned lane accesses) -- confirm with the
+  * callers.
+  */
+ void vp8_short_idct4x4llm_neon(int16_t *input, unsigned char *pred_ptr,
+ int pred_stride, unsigned char *dst_ptr,
+ int dst_stride) {
+ int i;
+ uint32x2_t d6u32 = vdup_n_u32(0);
+ uint8x8_t d1u8;
+ int16x4_t d2, d3, d4, d5, d10, d11, d12, d13;
+ uint16x8_t q1u16;
+ int16x8_t q1s16, q2s16, q3s16, q4s16;
+ int32x2x2_t v2tmp0, v2tmp1;
+ int16x4x2_t v2tmp2, v2tmp3;
+ // Load the 16 coefficients as four rows of four.
+ d2 = vld1_s16(input);
+ d3 = vld1_s16(input + 4);
+ d4 = vld1_s16(input + 8);
+ d5 = vld1_s16(input + 12);
+
+ // 1st for loop: first 1-D pass, combining the four rows element-wise.
+ q1s16 = vcombine_s16(d2, d4); // Swap d3 d4 here: q1 = rows 0|2, q2 = rows 1|3
+ q2s16 = vcombine_s16(d3, d5);
+ // Doubling-multiply high halves of rows 1|3 by each constant.
+ q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2);
+ q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1);
+
+ d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // a1 = row0 + row2
+ d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // b1 = row0 - row2
+ // Undo vqdmulh's doubling, then add the multiplicand back: x + x * c / 2^16.
+ q4s16 = vshrq_n_s16(q4s16, 1);
+
+ q4s16 = vqaddq_s16(q4s16, q2s16);
+
+ d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16)); // c1
+ d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16)); // d1
+ // Output butterflies of the first pass (saturating).
+ d2 = vqadd_s16(d12, d11);
+ d3 = vqadd_s16(d13, d10);
+ d4 = vqsub_s16(d13, d10);
+ d5 = vqsub_s16(d12, d11);
+ // 4x4 transpose: v2tmp2 ends up with columns 0 and 1, v2tmp3 with 2 and 3.
+ v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
+ v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
+ v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]),
+ vreinterpret_s16_s32(v2tmp1.val[0]));
+ v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]),
+ vreinterpret_s16_s32(v2tmp1.val[1]));
+
+ // 2nd for loop: same butterfly on the transposed data (q1 = 0|2, q2 = 1|3).
+ q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp3.val[0]);
+ q2s16 = vcombine_s16(v2tmp2.val[1], v2tmp3.val[1]);
+
+ q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2);
+ q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1);
+
+ d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // a1
+ d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // b1
+
+ q4s16 = vshrq_n_s16(q4s16, 1);
+
+ q4s16 = vqaddq_s16(q4s16, q2s16);
+
+ d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16)); // c1
+ d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16)); // d1
+
+ d2 = vqadd_s16(d12, d11);
+ d3 = vqadd_s16(d13, d10);
+ d4 = vqsub_s16(d13, d10);
+ d5 = vqsub_s16(d12, d11);
+ // Rounding shift of the final values: (x + 4) >> 3.
+ d2 = vrshr_n_s16(d2, 3);
+ d3 = vrshr_n_s16(d3, 3);
+ d4 = vrshr_n_s16(d4, 3);
+ d5 = vrshr_n_s16(d5, 3);
+ // Transpose back so that each vector holds one output row again.
+ v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
+ v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
+ v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]),
+ vreinterpret_s16_s32(v2tmp1.val[0]));
+ v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]),
+ vreinterpret_s16_s32(v2tmp1.val[1]));
+ // q1s16 = output rows 0|1, q2s16 = output rows 2|3.
+ q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp2.val[1]);
+ q2s16 = vcombine_s16(v2tmp3.val[0], v2tmp3.val[1]);
+
+ // dc_only_idct_add: add the prediction and store, two rows per iteration.
+ for (i = 0; i < 2; i++, q1s16 = q2s16) { // the increment swaps in rows 2|3
+ d6u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d6u32, 0);
+ pred_ptr += pred_stride;
+ d6u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d6u32, 1);
+ pred_ptr += pred_stride;
+ // Widen the 8 prediction bytes, add the residual, saturate to 0..255.
+ q1u16 = vaddw_u8(vreinterpretq_u16_s16(q1s16), vreinterpret_u8_u32(d6u32));
+ d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16));
+
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d1u8), 0);
+ dst_ptr += dst_stride;
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d1u8), 1);
+ dst_ptr += dst_stride;
+ }
+ return;
+ }
diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/sixtappredict_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/sixtappredict_neon.c
new file mode 100644
index 0000000000..ee3c281f0f
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/arm/neon/sixtappredict_neon.c
@@ -0,0 +1,1729 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <string.h>
+#include "./vpx_config.h"
+#include "./vp8_rtcd.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_ports/mem.h"
+ /* Signed six-tap filter coefficients, indexed by xoffset / yoffset (0-7).
+  * Only the first six entries of a row are taps; the rows are padded to eight
+  * entries so that a whole row can be loaded with vld1_s8. Every row sums to
+  * 128, matching the shift by 7 applied after filtering. */
+ static const int8_t vp8_sub_pel_filters[8][8] = {
+ { 0, 0, 128, 0, 0, 0, 0, 0 }, /* note that 1/8 pel positions are */
+ { 0, -6, 123, 12, -1, 0, 0, 0 }, /* just as per alpha -0.5 bicubic */
+ { 2, -11, 108, 36, -8, 1, 0, 0 }, /* New 1/4 pel 6 tap filter */
+ { 0, -9, 93, 50, -6, 0, 0, 0 },
+ { 3, -16, 77, 77, -16, 3, 0, 0 }, /* New 1/2 pel 6 tap filter */
+ { 0, -6, 50, 93, -9, 0, 0, 0 },
+ { 1, -8, 36, 108, -11, 2, 0, 0 }, /* New 1/4 pel 6 tap filter */
+ { 0, -1, 12, 123, -6, 0, 0, 0 },
+ };
+
+ // This table is derived from vp8/common/filter.c:vp8_sub_pel_filters by
+ // applying abs() to all the values. Elements 0, 2, 3, and 5 are never
+ // negative there. Elements 1 and 4 are either 0 or negative. The code
+ // accounts for this with multiply/accumulates which add (elements 0, 2, 3, 5)
+ // or subtract (elements 1, 4) as needed. The other functions will be updated
+ // to use this table later. It is also expanded to 8 elements per row to
+ // allow loading a row into a 64 bit neon register.
+ static const uint8_t abs_filters[8][8] = {
+ { 0, 0, 128, 0, 0, 0, 0, 0 }, { 0, 6, 123, 12, 1, 0, 0, 0 },
+ { 2, 11, 108, 36, 8, 1, 0, 0 }, { 0, 9, 93, 50, 6, 0, 0, 0 },
+ { 3, 16, 77, 77, 16, 3, 0, 0 }, { 0, 6, 50, 93, 9, 0, 0, 0 },
+ { 1, 8, 36, 108, 11, 2, 0, 0 }, { 0, 1, 12, 123, 6, 0, 0, 0 },
+ };
+
+ // Loads 8 bytes from `a` and shifts them, viewed as a single 64-bit lane,
+ // left by 32 bits: the low 32 bits of the load end up in the high 32 bits of
+ // the result and the low 32 bits of the result are zero.
+ static INLINE uint8x8_t load_and_shift(const unsigned char *a) {
+   const uint8x8_t loaded = vld1_u8(a);
+   return vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(loaded), 32));
+ }
+
+ // Adds `filter` times selected bytes of `a` into *c, and times the same bytes
+ // of `b` into *d, widening to 16 bits. The 8 bytes taken from each source are
+ // 32-bit lane 0 of its low half followed by 32-bit lane 0 of its high half.
+ static INLINE void filter_add_accumulate(const uint8x16_t a, const uint8x16_t b,
+                                          const uint8x8_t filter, uint16x8_t *c,
+                                          uint16x8_t *d) {
+   const uint32x2_t a_low = vreinterpret_u32_u8(vget_low_u8(a));
+   const uint32x2_t a_high = vreinterpret_u32_u8(vget_high_u8(a));
+   const uint32x2_t b_low = vreinterpret_u32_u8(vget_low_u8(b));
+   const uint32x2_t b_high = vreinterpret_u32_u8(vget_high_u8(b));
+   const uint8x8_t a_bytes = vreinterpret_u8_u32(vzip_u32(a_low, a_high).val[0]);
+   const uint8x8_t b_bytes = vreinterpret_u8_u32(vzip_u32(b_low, b_high).val[0]);
+
+   *c = vmlal_u8(*c, a_bytes, filter);
+   *d = vmlal_u8(*d, b_bytes, filter);
+ }
+
+ // Subtracts `filter` times selected bytes of `a` from *c, and times the same
+ // bytes of `b` from *d, widening to 16 bits. The 8 bytes taken from each
+ // source are 32-bit lane 0 of its low half followed by 32-bit lane 0 of its
+ // high half.
+ static INLINE void filter_sub_accumulate(const uint8x16_t a, const uint8x16_t b,
+                                          const uint8x8_t filter, uint16x8_t *c,
+                                          uint16x8_t *d) {
+   const uint32x2_t a_low = vreinterpret_u32_u8(vget_low_u8(a));
+   const uint32x2_t a_high = vreinterpret_u32_u8(vget_high_u8(a));
+   const uint32x2_t b_low = vreinterpret_u32_u8(vget_low_u8(b));
+   const uint32x2_t b_high = vreinterpret_u32_u8(vget_high_u8(b));
+   const uint8x8_t a_bytes = vreinterpret_u8_u32(vzip_u32(a_low, a_high).val[0]);
+   const uint8x8_t b_bytes = vreinterpret_u8_u32(vzip_u32(b_low, b_high).val[0]);
+
+   *c = vmlsl_u8(*c, a_bytes, filter);
+   *d = vmlsl_u8(*d, b_bytes, filter);
+ }
+ /* Vertical-only six-tap filter for one 4x4 block.
+  *
+  * src           - top-left pixel of the block; nine rows are read, starting
+  *                 two rows above it, with 8 bytes loaded from each row.
+  * src_stride    - distance in bytes between source rows.
+  * filter_offset - row of abs_filters to use.
+  * dst           - destination, written through store_unaligned_u8q.
+  * dst_stride    - destination stride handed to store_unaligned_u8q.
+  *
+  * In the comments below the nine loaded rows are numbered 0-8.
+  */
+ static INLINE void yonly4x4(const unsigned char *src, int src_stride,
+ int filter_offset, unsigned char *dst,
+ int dst_stride) {
+ uint8x8_t a0, a1, a2, a3, a4, a5, a6, a7, a8;
+ uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7, b8;
+ uint16x8_t c0, c1, c2, c3;
+ int16x8_t d0, d1;
+ uint8x8_t e0, e1;
+
+ const uint8x8_t filter = vld1_u8(abs_filters[filter_offset]);
+ const uint8x8_t filter0 = vdup_lane_u8(filter, 0);
+ const uint8x8_t filter1 = vdup_lane_u8(filter, 1);
+ const uint8x8_t filter2 = vdup_lane_u8(filter, 2);
+ const uint8x8_t filter3 = vdup_lane_u8(filter, 3);
+ const uint8x8_t filter4 = vdup_lane_u8(filter, 4);
+ const uint8x8_t filter5 = vdup_lane_u8(filter, 5);
+
+ src -= src_stride * 2; // start two rows above the block
+ // Shift the even rows to allow using 'vext' to combine the vectors. armv8
+ // has vcopy_lane which would be interesting. This started as just a
+ // horrible workaround for clang adding alignment hints to 32bit loads:
+ // https://llvm.org/bugs/show_bug.cgi?id=24421
+ // But it turns out it is almost identical to casting the loads.
+ a0 = load_and_shift(src);
+ src += src_stride;
+ a1 = vld1_u8(src);
+ src += src_stride;
+ a2 = load_and_shift(src);
+ src += src_stride;
+ a3 = vld1_u8(src);
+ src += src_stride;
+ a4 = load_and_shift(src);
+ src += src_stride;
+ a5 = vld1_u8(src);
+ src += src_stride;
+ a6 = load_and_shift(src);
+ src += src_stride;
+ a7 = vld1_u8(src);
+ src += src_stride;
+ a8 = vld1_u8(src);
+
+ // Combine the rows so we can operate on 8 at a time.
+ b0 = vext_u8(a0, a1, 4); // rows 0 | 1
+ b2 = vext_u8(a2, a3, 4); // rows 2 | 3
+ b4 = vext_u8(a4, a5, 4); // rows 4 | 5
+ b6 = vext_u8(a6, a7, 4); // rows 6 | 7
+ b8 = a8; // row 8 in the first four bytes
+
+ // To keep with the 8-at-a-time theme, combine *alternate* rows. This
+ // allows combining the odd rows with the even.
+ b1 = vext_u8(b0, b2, 4); // rows 1 | 2
+ b3 = vext_u8(b2, b4, 4); // rows 3 | 4
+ b5 = vext_u8(b4, b6, 4); // rows 5 | 6
+ b7 = vext_u8(b6, b8, 4); // rows 7 | 8
+
+ // Multiply and expand to 16 bits. c0/c2 feed output rows 0-1, c1/c3 rows 2-3.
+ c0 = vmull_u8(b0, filter0);
+ c1 = vmull_u8(b2, filter0);
+ c2 = vmull_u8(b5, filter5);
+ c3 = vmull_u8(b7, filter5);
+
+ // Multiply, subtract and accumulate for filters 1 and 4 (the negative
+ // ones).
+ c0 = vmlsl_u8(c0, b4, filter4);
+ c1 = vmlsl_u8(c1, b6, filter4);
+ c2 = vmlsl_u8(c2, b1, filter1);
+ c3 = vmlsl_u8(c3, b3, filter1);
+
+ // Add more positive ones. vmlal should really return a signed type.
+ // It's doing signed math internally, as evidenced by the fact we can do
+ // subtractions followed by more additions. Ideally we could use
+ // vqmlal/sl but that instruction doesn't exist. Might be able to
+ // shoehorn vqdmlal/vqdmlsl in here but it would take some effort.
+ c0 = vmlal_u8(c0, b2, filter2);
+ c1 = vmlal_u8(c1, b4, filter2);
+ c2 = vmlal_u8(c2, b3, filter3);
+ c3 = vmlal_u8(c3, b5, filter3);
+
+ // Use signed saturation math because vmlsl may have left some negative
+ // numbers in there.
+ d0 = vqaddq_s16(vreinterpretq_s16_u16(c2), vreinterpretq_s16_u16(c0));
+ d1 = vqaddq_s16(vreinterpretq_s16_u16(c3), vreinterpretq_s16_u16(c1));
+
+ // Use signed again because numbers like -200 need to be saturated to 0.
+ e0 = vqrshrun_n_s16(d0, 7); // output rows 0 | 1
+ e1 = vqrshrun_n_s16(d1, 7); // output rows 2 | 3
+
+ store_unaligned_u8q(dst, dst_stride, vcombine_u8(e0, e1));
+ }
+ /* Six-tap sub-pixel prediction of one 4x4 block.
+  *
+  * src_ptr             - top-left source pixel of the block.
+  * src_pixels_per_line - distance in bytes between source rows.
+  * xoffset             - row of abs_filters used for the horizontal pass;
+  *                       0 skips that pass and defers to yonly4x4.
+  * yoffset             - row of abs_filters used for the vertical pass;
+  *                       0 skips that pass.
+  * dst_ptr, dst_pitch  - destination and stride, written through
+  *                       store_unaligned_u8q.
+  *
+  * When a horizontal pass is run, 16 bytes are loaded from every source row
+  * starting at src_ptr - 2. With both passes, nine rows are filtered
+  * horizontally, starting two rows above the block, and the vertical pass
+  * then runs on those nine intermediate rows.
+  */
+ void vp8_sixtap_predict4x4_neon(unsigned char *src_ptr, int src_pixels_per_line,
+ int xoffset, int yoffset,
+ unsigned char *dst_ptr, int dst_pitch) {
+ uint8x16_t s0, s1, s2, s3, s4;
+ uint64x2_t s01, s23;
+ // Variables to hold src[] elements for the given filter[]
+ uint8x8_t s0_f5, s1_f5, s2_f5, s3_f5, s4_f5;
+ uint8x8_t s4_f1, s4_f2, s4_f3, s4_f4;
+ uint8x16_t s01_f0, s23_f0;
+ uint64x2_t s01_f3, s23_f3;
+ uint32x2x2_t s01_f3_q, s23_f3_q, s01_f5_q, s23_f5_q;
+ // Accumulator variables.
+ uint16x8_t d0123, d4567, d89;
+ uint16x8_t d0123_a, d4567_a, d89_a;
+ int16x8_t e0123, e4567, e89;
+ // Second pass intermediates.
+ uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7, b8;
+ uint16x8_t c0, c1, c2, c3;
+ int16x8_t d0, d1;
+ uint8x8_t e0, e1;
+ uint8x8_t filter, filter0, filter1, filter2, filter3, filter4, filter5;
+
+ if (xoffset == 0) { // Second pass only.
+ yonly4x4(src_ptr, src_pixels_per_line, yoffset, dst_ptr, dst_pitch);
+ return;
+ }
+
+ if (yoffset == 0) { // First pass only.
+ src_ptr -= 2;
+ } else { // Add context for the second pass. 2 extra lines on top.
+ src_ptr -= 2 + (src_pixels_per_line * 2);
+ }
+ // Broadcast each horizontal tap (absolute value) across its own vector.
+ filter = vld1_u8(abs_filters[xoffset]);
+ filter0 = vdup_lane_u8(filter, 0);
+ filter1 = vdup_lane_u8(filter, 1);
+ filter2 = vdup_lane_u8(filter, 2);
+ filter3 = vdup_lane_u8(filter, 3);
+ filter4 = vdup_lane_u8(filter, 4);
+ filter5 = vdup_lane_u8(filter, 5);
+
+ // 2 bytes of context, 4 bytes of src values, 3 bytes of context, 7 bytes of
+ // garbage. So much effort for that last single byte (index 8, past the low
+ // 64 bits). The low values of each pair are for filter0.
+ s0 = vld1q_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+ s1 = vld1q_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+ s2 = vld1q_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+ s3 = vld1q_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+
+ // Shift to extract values for filter[5]
+ // Numbering the first block pixel of a row as src[0], this puts:
+ // 3 4 5 6 7 8 9 10 in s0_f5
+ // Can't use vshr.u64 because it crosses the double word boundary.
+ s0_f5 = vext_u8(vget_low_u8(s0), vget_high_u8(s0), 5);
+ s1_f5 = vext_u8(vget_low_u8(s1), vget_high_u8(s1), 5);
+ s2_f5 = vext_u8(vget_low_u8(s2), vget_high_u8(s2), 5);
+ s3_f5 = vext_u8(vget_low_u8(s3), vget_high_u8(s3), 5);
+ // Pack the low 8 bytes of two rows per vector: rows 0|1 and rows 2|3.
+ s01_f0 = vcombine_u8(vget_low_u8(s0), vget_low_u8(s1));
+ s23_f0 = vcombine_u8(vget_low_u8(s2), vget_low_u8(s3));
+ // 3 4 5 6 * filter5, four pixels from each of two rows per accumulator.
+ s01_f5_q = vzip_u32(vreinterpret_u32_u8(s0_f5), vreinterpret_u32_u8(s1_f5));
+ s23_f5_q = vzip_u32(vreinterpret_u32_u8(s2_f5), vreinterpret_u32_u8(s3_f5));
+ d0123 = vmull_u8(vreinterpret_u8_u32(s01_f5_q.val[0]), filter5);
+ d4567 = vmull_u8(vreinterpret_u8_u32(s23_f5_q.val[0]), filter5);
+
+ // Keep original src data as 64 bits to simplify shifting and extracting.
+ s01 = vreinterpretq_u64_u8(s01_f0);
+ s23 = vreinterpretq_u64_u8(s23_f0);
+
+ // -2 -1 0 1 * filter0
+ filter_add_accumulate(s01_f0, s23_f0, filter0, &d0123, &d4567);
+
+ // Shift over one to use -1, 0, 1, 2 for filter1
+ // -1 0 1 2 * filter1
+ filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 8)),
+ vreinterpretq_u8_u64(vshrq_n_u64(s23, 8)), filter1,
+ &d0123, &d4567);
+
+ // 2 3 4 5 * filter4
+ filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 32)),
+ vreinterpretq_u8_u64(vshrq_n_u64(s23, 32)), filter4,
+ &d0123, &d4567);
+
+ // 0 1 2 3 * filter2
+ filter_add_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 16)),
+ vreinterpretq_u8_u64(vshrq_n_u64(s23, 16)), filter2,
+ &d0123, &d4567);
+
+ // 1 2 3 4 * filter3
+ s01_f3 = vshrq_n_u64(s01, 24);
+ s23_f3 = vshrq_n_u64(s23, 24);
+ s01_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s01_f3)),
+ vreinterpret_u32_u64(vget_high_u64(s01_f3)));
+ s23_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s23_f3)),
+ vreinterpret_u32_u64(vget_high_u64(s23_f3)));
+ // Accumulate into different registers so it can use saturated addition.
+ d0123_a = vmull_u8(vreinterpret_u8_u32(s01_f3_q.val[0]), filter3);
+ d4567_a = vmull_u8(vreinterpret_u8_u32(s23_f3_q.val[0]), filter3);
+ // Saturating signed add of the two partial sums.
+ e0123 =
+ vqaddq_s16(vreinterpretq_s16_u16(d0123), vreinterpretq_s16_u16(d0123_a));
+ e4567 =
+ vqaddq_s16(vreinterpretq_s16_u16(d4567), vreinterpretq_s16_u16(d4567_a));
+
+ // Round, shift right by 7 and narrow with unsigned saturation.
+ b0 = vqrshrun_n_s16(e0123, 7); // filtered rows 0 | 1
+ b2 = vqrshrun_n_s16(e4567, 7); // filtered rows 2 | 3
+
+ if (yoffset == 0) { // firstpass_filter4x4_only
+ store_unaligned_u8q(dst_ptr, dst_pitch, vcombine_u8(b0, b2));
+ return;
+ }
+
+ // Load additional context when doing both filters: five more rows (4-8).
+ s0 = vld1q_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+ s1 = vld1q_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+ s2 = vld1q_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+ s3 = vld1q_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+ s4 = vld1q_u8(src_ptr);
+
+ s0_f5 = vext_u8(vget_low_u8(s0), vget_high_u8(s0), 5);
+ s1_f5 = vext_u8(vget_low_u8(s1), vget_high_u8(s1), 5);
+ s2_f5 = vext_u8(vget_low_u8(s2), vget_high_u8(s2), 5);
+ s3_f5 = vext_u8(vget_low_u8(s3), vget_high_u8(s3), 5);
+ s4_f5 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 5);
+
+ // -2 -1 0 1 (for filter0): low halves of two rows per vector.
+ s01_f0 = vcombine_u8(vget_low_u8(s0), vget_low_u8(s1));
+ s23_f0 = vcombine_u8(vget_low_u8(s2), vget_low_u8(s3));
+
+ s01_f5_q = vzip_u32(vreinterpret_u32_u8(s0_f5), vreinterpret_u32_u8(s1_f5));
+ s23_f5_q = vzip_u32(vreinterpret_u32_u8(s2_f5), vreinterpret_u32_u8(s3_f5));
+ // But this time instead of 16 pixels to filter, there are 20. So an extra
+ // run with a doubleword register.
+ d0123 = vmull_u8(vreinterpret_u8_u32(s01_f5_q.val[0]), filter5);
+ d4567 = vmull_u8(vreinterpret_u8_u32(s23_f5_q.val[0]), filter5);
+ d89 = vmull_u8(s4_f5, filter5);
+
+ // Save a copy as u64 for shifting.
+ s01 = vreinterpretq_u64_u8(s01_f0);
+ s23 = vreinterpretq_u64_u8(s23_f0);
+ // filter0; the fifth row (s4) is accumulated on its own in d89.
+ filter_add_accumulate(s01_f0, s23_f0, filter0, &d0123, &d4567);
+ d89 = vmlal_u8(d89, vget_low_u8(s4), filter0);
+ // filter1 (subtracted)
+ filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 8)),
+ vreinterpretq_u8_u64(vshrq_n_u64(s23, 8)), filter1,
+ &d0123, &d4567);
+ s4_f1 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 1);
+ d89 = vmlsl_u8(d89, s4_f1, filter1);
+ // filter4 (subtracted)
+ filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 32)),
+ vreinterpretq_u8_u64(vshrq_n_u64(s23, 32)), filter4,
+ &d0123, &d4567);
+ s4_f4 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 4);
+ d89 = vmlsl_u8(d89, s4_f4, filter4);
+ // filter2
+ filter_add_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 16)),
+ vreinterpretq_u8_u64(vshrq_n_u64(s23, 16)), filter2,
+ &d0123, &d4567);
+ s4_f2 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 2);
+ d89 = vmlal_u8(d89, s4_f2, filter2);
+ // filter3, again in separate accumulators for the saturating add.
+ s01_f3 = vshrq_n_u64(s01, 24);
+ s23_f3 = vshrq_n_u64(s23, 24);
+ s01_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s01_f3)),
+ vreinterpret_u32_u64(vget_high_u64(s01_f3)));
+ s23_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s23_f3)),
+ vreinterpret_u32_u64(vget_high_u64(s23_f3)));
+ s4_f3 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 3);
+ d0123_a = vmull_u8(vreinterpret_u8_u32(s01_f3_q.val[0]), filter3);
+ d4567_a = vmull_u8(vreinterpret_u8_u32(s23_f3_q.val[0]), filter3);
+ d89_a = vmull_u8(s4_f3, filter3);
+
+ e0123 =
+ vqaddq_s16(vreinterpretq_s16_u16(d0123), vreinterpretq_s16_u16(d0123_a));
+ e4567 =
+ vqaddq_s16(vreinterpretq_s16_u16(d4567), vreinterpretq_s16_u16(d4567_a));
+ e89 = vqaddq_s16(vreinterpretq_s16_u16(d89), vreinterpretq_s16_u16(d89_a));
+
+ b4 = vqrshrun_n_s16(e0123, 7); // filtered rows 4 | 5
+ b6 = vqrshrun_n_s16(e4567, 7); // filtered rows 6 | 7
+ b8 = vqrshrun_n_s16(e89, 7); // filtered row 8 in the first four bytes
+
+ // Second pass: 4x4
+ filter = vld1_u8(abs_filters[yoffset]);
+ filter0 = vdup_lane_u8(filter, 0);
+ filter1 = vdup_lane_u8(filter, 1);
+ filter2 = vdup_lane_u8(filter, 2);
+ filter3 = vdup_lane_u8(filter, 3);
+ filter4 = vdup_lane_u8(filter, 4);
+ filter5 = vdup_lane_u8(filter, 5);
+ // Build the odd-aligned row pairs: rows 1|2, 3|4, 5|6 and 7|8.
+ b1 = vext_u8(b0, b2, 4);
+ b3 = vext_u8(b2, b4, 4);
+ b5 = vext_u8(b4, b6, 4);
+ b7 = vext_u8(b6, b8, 4);
+ // c0/c2 feed output rows 0-1, c1/c3 feed output rows 2-3.
+ c0 = vmull_u8(b0, filter0);
+ c1 = vmull_u8(b2, filter0);
+ c2 = vmull_u8(b5, filter5);
+ c3 = vmull_u8(b7, filter5);
+ // Taps 4 and 1 are subtracted.
+ c0 = vmlsl_u8(c0, b4, filter4);
+ c1 = vmlsl_u8(c1, b6, filter4);
+ c2 = vmlsl_u8(c2, b1, filter1);
+ c3 = vmlsl_u8(c3, b3, filter1);
+
+ c0 = vmlal_u8(c0, b2, filter2);
+ c1 = vmlal_u8(c1, b4, filter2);
+ c2 = vmlal_u8(c2, b3, filter3);
+ c3 = vmlal_u8(c3, b5, filter3);
+ // Saturating signed add of the two halves of each sum.
+ d0 = vqaddq_s16(vreinterpretq_s16_u16(c2), vreinterpretq_s16_u16(c0));
+ d1 = vqaddq_s16(vreinterpretq_s16_u16(c3), vreinterpretq_s16_u16(c1));
+
+ e0 = vqrshrun_n_s16(d0, 7);
+ e1 = vqrshrun_n_s16(d1, 7);
+
+ store_unaligned_u8q(dst_ptr, dst_pitch, vcombine_u8(e0, e1));
+ }
+ /* Six-tap sub-pixel prediction of one block 8 pixels wide and 4 rows high.
+  *
+  * src_ptr             - top-left source pixel of the block.
+  * src_pixels_per_line - distance in bytes between source rows.
+  * xoffset             - row of vp8_sub_pel_filters used for the horizontal
+  *                       pass; 0 skips that pass.
+  * yoffset             - row of vp8_sub_pel_filters used for the vertical
+  *                       pass; 0 skips that pass.
+  * dst_ptr, dst_pitch  - destination and its stride; four rows of 8 bytes are
+  *                       written.
+  *
+  * The filter taps are loaded signed, turned into absolute values, and the
+  * products for taps 1 and 4 are subtracted instead of added. Vertical-only
+  * filtering reads nine rows of 8 bytes starting two rows above the block.
+  * The horizontal pass loads 16 bytes per row starting at src - 2; when both
+  * passes run it covers nine rows starting two rows above the block.
+  */
+ void vp8_sixtap_predict8x4_neon(unsigned char *src_ptr, int src_pixels_per_line,
+ int xoffset, int yoffset,
+ unsigned char *dst_ptr, int dst_pitch) {
+ unsigned char *src;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
+ uint8x8_t d22u8, d23u8, d24u8, d25u8, d26u8;
+ uint8x8_t d27u8, d28u8, d29u8, d30u8, d31u8;
+ int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
+ uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
+ uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
+ int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
+ uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8;
+
+ if (xoffset == 0) { // secondpass_filter8x4_only
+ // load second_pass filter
+ dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
+ d0s8 = vdup_lane_s8(dtmps8, 0);
+ d1s8 = vdup_lane_s8(dtmps8, 1);
+ d2s8 = vdup_lane_s8(dtmps8, 2);
+ d3s8 = vdup_lane_s8(dtmps8, 3);
+ d4s8 = vdup_lane_s8(dtmps8, 4);
+ d5s8 = vdup_lane_s8(dtmps8, 5);
+ d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+ d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+ d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+ d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+ d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+ d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+ // load src data: nine rows of 8 bytes, starting two rows above the block
+ src = src_ptr - src_pixels_per_line * 2;
+ d22u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d23u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d24u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d25u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d26u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d27u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d28u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d29u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d30u8 = vld1_u8(src);
+ // Tap 0; q3..q6 accumulate output rows 0..3.
+ q3u16 = vmull_u8(d22u8, d0u8);
+ q4u16 = vmull_u8(d23u8, d0u8);
+ q5u16 = vmull_u8(d24u8, d0u8);
+ q6u16 = vmull_u8(d25u8, d0u8);
+ // Tap 1 (subtracted).
+ q3u16 = vmlsl_u8(q3u16, d23u8, d1u8);
+ q4u16 = vmlsl_u8(q4u16, d24u8, d1u8);
+ q5u16 = vmlsl_u8(q5u16, d25u8, d1u8);
+ q6u16 = vmlsl_u8(q6u16, d26u8, d1u8);
+ // Tap 4 (subtracted).
+ q3u16 = vmlsl_u8(q3u16, d26u8, d4u8);
+ q4u16 = vmlsl_u8(q4u16, d27u8, d4u8);
+ q5u16 = vmlsl_u8(q5u16, d28u8, d4u8);
+ q6u16 = vmlsl_u8(q6u16, d29u8, d4u8);
+ // Tap 2.
+ q3u16 = vmlal_u8(q3u16, d24u8, d2u8);
+ q4u16 = vmlal_u8(q4u16, d25u8, d2u8);
+ q5u16 = vmlal_u8(q5u16, d26u8, d2u8);
+ q6u16 = vmlal_u8(q6u16, d27u8, d2u8);
+ // Tap 5.
+ q3u16 = vmlal_u8(q3u16, d27u8, d5u8);
+ q4u16 = vmlal_u8(q4u16, d28u8, d5u8);
+ q5u16 = vmlal_u8(q5u16, d29u8, d5u8);
+ q6u16 = vmlal_u8(q6u16, d30u8, d5u8);
+ // Tap 3, kept in separate accumulators for the saturating add below.
+ q7u16 = vmull_u8(d25u8, d3u8);
+ q8u16 = vmull_u8(d26u8, d3u8);
+ q9u16 = vmull_u8(d27u8, d3u8);
+ q10u16 = vmull_u8(d28u8, d3u8);
+
+ q3s16 = vreinterpretq_s16_u16(q3u16);
+ q4s16 = vreinterpretq_s16_u16(q4u16);
+ q5s16 = vreinterpretq_s16_u16(q5u16);
+ q6s16 = vreinterpretq_s16_u16(q6u16);
+ q7s16 = vreinterpretq_s16_u16(q7u16);
+ q8s16 = vreinterpretq_s16_u16(q8u16);
+ q9s16 = vreinterpretq_s16_u16(q9u16);
+ q10s16 = vreinterpretq_s16_u16(q10u16);
+ // Saturating signed add of the two partial sums.
+ q7s16 = vqaddq_s16(q7s16, q3s16);
+ q8s16 = vqaddq_s16(q8s16, q4s16);
+ q9s16 = vqaddq_s16(q9s16, q5s16);
+ q10s16 = vqaddq_s16(q10s16, q6s16);
+ // Round, shift right by 7 and narrow with unsigned saturation.
+ d6u8 = vqrshrun_n_s16(q7s16, 7);
+ d7u8 = vqrshrun_n_s16(q8s16, 7);
+ d8u8 = vqrshrun_n_s16(q9s16, 7);
+ d9u8 = vqrshrun_n_s16(q10s16, 7);
+
+ vst1_u8(dst_ptr, d6u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d7u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d8u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d9u8);
+ return;
+ }
+
+ // load first_pass filter
+ dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
+ d0s8 = vdup_lane_s8(dtmps8, 0);
+ d1s8 = vdup_lane_s8(dtmps8, 1);
+ d2s8 = vdup_lane_s8(dtmps8, 2);
+ d3s8 = vdup_lane_s8(dtmps8, 3);
+ d4s8 = vdup_lane_s8(dtmps8, 4);
+ d5s8 = vdup_lane_s8(dtmps8, 5);
+ d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+ d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+ d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+ d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+ d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+ d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+ // First pass: horizontal filter on the first four rows (8 columns each).
+ if (yoffset == 0) // firstpass_filter8x4_only
+ src = src_ptr - 2;
+ else
+ src = src_ptr - 2 - (src_pixels_per_line * 2);
+ q3u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q4u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q5u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q6u8 = vld1q_u8(src);
+ // Tap 0 on loaded bytes 0-7 of each row.
+ q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
+ q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
+ q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
+ q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
+ // Tap 1 (subtracted) on bytes 1-8.
+ d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
+ d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
+ d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
+ d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
+
+ q7u16 = vmlsl_u8(q7u16, d28u8, d1u8);
+ q8u16 = vmlsl_u8(q8u16, d29u8, d1u8);
+ q9u16 = vmlsl_u8(q9u16, d30u8, d1u8);
+ q10u16 = vmlsl_u8(q10u16, d31u8, d1u8);
+ // Tap 4 (subtracted) on bytes 4-11.
+ d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
+ d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
+ d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
+ d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
+
+ q7u16 = vmlsl_u8(q7u16, d28u8, d4u8);
+ q8u16 = vmlsl_u8(q8u16, d29u8, d4u8);
+ q9u16 = vmlsl_u8(q9u16, d30u8, d4u8);
+ q10u16 = vmlsl_u8(q10u16, d31u8, d4u8);
+ // Tap 2 on bytes 2-9.
+ d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
+ d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
+ d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
+ d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
+
+ q7u16 = vmlal_u8(q7u16, d28u8, d2u8);
+ q8u16 = vmlal_u8(q8u16, d29u8, d2u8);
+ q9u16 = vmlal_u8(q9u16, d30u8, d2u8);
+ q10u16 = vmlal_u8(q10u16, d31u8, d2u8);
+ // Tap 5 on bytes 5-12.
+ d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
+ d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
+ d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
+ d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
+
+ q7u16 = vmlal_u8(q7u16, d28u8, d5u8);
+ q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
+ q9u16 = vmlal_u8(q9u16, d30u8, d5u8);
+ q10u16 = vmlal_u8(q10u16, d31u8, d5u8);
+ // Tap 3 on bytes 3-10, in separate accumulators for the saturating add.
+ d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
+ d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
+ d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
+ d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
+
+ q3u16 = vmull_u8(d28u8, d3u8);
+ q4u16 = vmull_u8(d29u8, d3u8);
+ q5u16 = vmull_u8(d30u8, d3u8);
+ q6u16 = vmull_u8(d31u8, d3u8);
+
+ q3s16 = vreinterpretq_s16_u16(q3u16);
+ q4s16 = vreinterpretq_s16_u16(q4u16);
+ q5s16 = vreinterpretq_s16_u16(q5u16);
+ q6s16 = vreinterpretq_s16_u16(q6u16);
+ q7s16 = vreinterpretq_s16_u16(q7u16);
+ q8s16 = vreinterpretq_s16_u16(q8u16);
+ q9s16 = vreinterpretq_s16_u16(q9u16);
+ q10s16 = vreinterpretq_s16_u16(q10u16);
+ // Saturating signed add of the two partial sums.
+ q7s16 = vqaddq_s16(q7s16, q3s16);
+ q8s16 = vqaddq_s16(q8s16, q4s16);
+ q9s16 = vqaddq_s16(q9s16, q5s16);
+ q10s16 = vqaddq_s16(q10s16, q6s16);
+ // Round, shift right by 7 and narrow: horizontally filtered rows 0-3.
+ d22u8 = vqrshrun_n_s16(q7s16, 7);
+ d23u8 = vqrshrun_n_s16(q8s16, 7);
+ d24u8 = vqrshrun_n_s16(q9s16, 7);
+ d25u8 = vqrshrun_n_s16(q10s16, 7);
+
+ if (yoffset == 0) { // firstpass_filter8x4_only
+ vst1_u8(dst_ptr, d22u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d23u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d24u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d25u8);
+ return;
+ }
+
+ // First Pass on rest 5-line data
+ src += src_pixels_per_line;
+ q3u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q4u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q5u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q6u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q7u8 = vld1q_u8(src);
+ // Tap 0.
+ q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
+ q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
+ q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
+ q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
+ q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8);
+ // Tap 1 (subtracted).
+ d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
+ d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
+ d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
+ d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
+ d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1);
+
+ q8u16 = vmlsl_u8(q8u16, d27u8, d1u8);
+ q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
+ q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
+ q11u16 = vmlsl_u8(q11u16, d30u8, d1u8);
+ q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);
+ // Tap 4 (subtracted).
+ d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
+ d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
+ d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
+ d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
+ d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4);
+
+ q8u16 = vmlsl_u8(q8u16, d27u8, d4u8);
+ q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
+ q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
+ q11u16 = vmlsl_u8(q11u16, d30u8, d4u8);
+ q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);
+ // Tap 2.
+ d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
+ d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
+ d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
+ d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
+ d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2);
+
+ q8u16 = vmlal_u8(q8u16, d27u8, d2u8);
+ q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
+ q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
+ q11u16 = vmlal_u8(q11u16, d30u8, d2u8);
+ q12u16 = vmlal_u8(q12u16, d31u8, d2u8);
+ // Tap 5.
+ d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
+ d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
+ d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
+ d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
+ d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5);
+
+ q8u16 = vmlal_u8(q8u16, d27u8, d5u8);
+ q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
+ q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
+ q11u16 = vmlal_u8(q11u16, d30u8, d5u8);
+ q12u16 = vmlal_u8(q12u16, d31u8, d5u8);
+ // Tap 3, in separate accumulators for the saturating add.
+ d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
+ d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
+ d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
+ d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
+ d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3);
+
+ q3u16 = vmull_u8(d27u8, d3u8);
+ q4u16 = vmull_u8(d28u8, d3u8);
+ q5u16 = vmull_u8(d29u8, d3u8);
+ q6u16 = vmull_u8(d30u8, d3u8);
+ q7u16 = vmull_u8(d31u8, d3u8);
+
+ q3s16 = vreinterpretq_s16_u16(q3u16);
+ q4s16 = vreinterpretq_s16_u16(q4u16);
+ q5s16 = vreinterpretq_s16_u16(q5u16);
+ q6s16 = vreinterpretq_s16_u16(q6u16);
+ q7s16 = vreinterpretq_s16_u16(q7u16);
+ q8s16 = vreinterpretq_s16_u16(q8u16);
+ q9s16 = vreinterpretq_s16_u16(q9u16);
+ q10s16 = vreinterpretq_s16_u16(q10u16);
+ q11s16 = vreinterpretq_s16_u16(q11u16);
+ q12s16 = vreinterpretq_s16_u16(q12u16);
+
+ q8s16 = vqaddq_s16(q8s16, q3s16);
+ q9s16 = vqaddq_s16(q9s16, q4s16);
+ q10s16 = vqaddq_s16(q10s16, q5s16);
+ q11s16 = vqaddq_s16(q11s16, q6s16);
+ q12s16 = vqaddq_s16(q12s16, q7s16);
+ // Horizontally filtered rows 4-8.
+ d26u8 = vqrshrun_n_s16(q8s16, 7);
+ d27u8 = vqrshrun_n_s16(q9s16, 7);
+ d28u8 = vqrshrun_n_s16(q10s16, 7);
+ d29u8 = vqrshrun_n_s16(q11s16, 7);
+ d30u8 = vqrshrun_n_s16(q12s16, 7);
+
+ // Second pass: 8x4, vertical filter over the nine rows held in d22..d30.
+ dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
+ d0s8 = vdup_lane_s8(dtmps8, 0);
+ d1s8 = vdup_lane_s8(dtmps8, 1);
+ d2s8 = vdup_lane_s8(dtmps8, 2);
+ d3s8 = vdup_lane_s8(dtmps8, 3);
+ d4s8 = vdup_lane_s8(dtmps8, 4);
+ d5s8 = vdup_lane_s8(dtmps8, 5);
+ d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+ d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+ d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+ d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+ d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+ d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+ // Tap 0; q3..q6 accumulate output rows 0..3.
+ q3u16 = vmull_u8(d22u8, d0u8);
+ q4u16 = vmull_u8(d23u8, d0u8);
+ q5u16 = vmull_u8(d24u8, d0u8);
+ q6u16 = vmull_u8(d25u8, d0u8);
+ // Tap 1 (subtracted).
+ q3u16 = vmlsl_u8(q3u16, d23u8, d1u8);
+ q4u16 = vmlsl_u8(q4u16, d24u8, d1u8);
+ q5u16 = vmlsl_u8(q5u16, d25u8, d1u8);
+ q6u16 = vmlsl_u8(q6u16, d26u8, d1u8);
+ // Tap 4 (subtracted).
+ q3u16 = vmlsl_u8(q3u16, d26u8, d4u8);
+ q4u16 = vmlsl_u8(q4u16, d27u8, d4u8);
+ q5u16 = vmlsl_u8(q5u16, d28u8, d4u8);
+ q6u16 = vmlsl_u8(q6u16, d29u8, d4u8);
+ // Tap 2.
+ q3u16 = vmlal_u8(q3u16, d24u8, d2u8);
+ q4u16 = vmlal_u8(q4u16, d25u8, d2u8);
+ q5u16 = vmlal_u8(q5u16, d26u8, d2u8);
+ q6u16 = vmlal_u8(q6u16, d27u8, d2u8);
+ // Tap 5.
+ q3u16 = vmlal_u8(q3u16, d27u8, d5u8);
+ q4u16 = vmlal_u8(q4u16, d28u8, d5u8);
+ q5u16 = vmlal_u8(q5u16, d29u8, d5u8);
+ q6u16 = vmlal_u8(q6u16, d30u8, d5u8);
+ // Tap 3, in separate accumulators for the saturating add.
+ q7u16 = vmull_u8(d25u8, d3u8);
+ q8u16 = vmull_u8(d26u8, d3u8);
+ q9u16 = vmull_u8(d27u8, d3u8);
+ q10u16 = vmull_u8(d28u8, d3u8);
+
+ q3s16 = vreinterpretq_s16_u16(q3u16);
+ q4s16 = vreinterpretq_s16_u16(q4u16);
+ q5s16 = vreinterpretq_s16_u16(q5u16);
+ q6s16 = vreinterpretq_s16_u16(q6u16);
+ q7s16 = vreinterpretq_s16_u16(q7u16);
+ q8s16 = vreinterpretq_s16_u16(q8u16);
+ q9s16 = vreinterpretq_s16_u16(q9u16);
+ q10s16 = vreinterpretq_s16_u16(q10u16);
+
+ q7s16 = vqaddq_s16(q7s16, q3s16);
+ q8s16 = vqaddq_s16(q8s16, q4s16);
+ q9s16 = vqaddq_s16(q9s16, q5s16);
+ q10s16 = vqaddq_s16(q10s16, q6s16);
+
+ d6u8 = vqrshrun_n_s16(q7s16, 7);
+ d7u8 = vqrshrun_n_s16(q8s16, 7);
+ d8u8 = vqrshrun_n_s16(q9s16, 7);
+ d9u8 = vqrshrun_n_s16(q10s16, 7);
+
+ vst1_u8(dst_ptr, d6u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d7u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d8u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d9u8);
+ }
+/*
+ * Six-tap sub-pixel prediction of one 8x8 block (NEON intrinsics).
+ *
+ * Filter taps are read from vp8_sub_pel_filters[xoffset] for the horizontal
+ * pass and vp8_sub_pel_filters[yoffset] for the vertical pass. Both passes
+ * work on the absolute values of the six taps: products for taps 0, 2, 3 and
+ * 5 are added, products for taps 1 and 4 are subtracted, and the sum is
+ * narrowed with a rounding, saturating shift right by 7.
+ * NOTE(review): this presumably relies on taps 1 and 4 being <= 0 and the
+ * other taps being >= 0 in vp8_sub_pel_filters (defined elsewhere) - confirm.
+ *
+ * src_ptr, src_pixels_per_line: source pointer and row stride. Reads start
+ *   two rows above src_ptr when the vertical filter runs and two pixels to
+ *   its left when the horizontal filter runs.
+ * xoffset, yoffset: row indices into vp8_sub_pel_filters.
+ * dst_ptr, dst_pitch: destination and row stride; 8 rows of 8 bytes are
+ *   written.
+ *
+ * Paths:
+ *   xoffset == 0: vertical filter only, 13 source rows straight to dst.
+ *   yoffset == 0: horizontal filter only, 8 source rows straight to dst.
+ *   otherwise:    horizontal filter over 13 rows (8 stored in tmp, 5 kept in
+ *                 registers), then the vertical filter over those rows.
+ */
+void vp8_sixtap_predict8x8_neon(unsigned char *src_ptr, int src_pixels_per_line,
+ int xoffset, int yoffset,
+ unsigned char *dst_ptr, int dst_pitch) {
+ unsigned char *src, *tmpp;
+ unsigned char tmp[64]; // first-pass output: 8 rows of 8 bytes
+ int i;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
+ uint8x8_t d18u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8, d25u8;
+ uint8x8_t d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
+ int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
+ uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
+ uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
+ int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
+ uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q9u8, q10u8, q11u8, q12u8;
+ // Vertical-only path: source rows are filtered straight into dst.
+ if (xoffset == 0) { // secondpass_filter8x8_only
+ // load second_pass filter: broadcast each tap, then take its absolute value
+ dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
+ d0s8 = vdup_lane_s8(dtmps8, 0);
+ d1s8 = vdup_lane_s8(dtmps8, 1);
+ d2s8 = vdup_lane_s8(dtmps8, 2);
+ d3s8 = vdup_lane_s8(dtmps8, 3);
+ d4s8 = vdup_lane_s8(dtmps8, 4);
+ d5s8 = vdup_lane_s8(dtmps8, 5);
+ d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+ d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+ d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+ d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+ d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+ d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+ // load src data: 13 rows of 8 bytes, starting two rows above src_ptr
+ src = src_ptr - src_pixels_per_line * 2;
+ d18u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d19u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d20u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d21u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d22u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d23u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d24u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d25u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d26u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d27u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d28u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d29u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d30u8 = vld1_u8(src);
+ // Each iteration filters 4 output rows from the 9 rows held in d18..d26.
+ for (i = 2; i > 0; i--) {
+ q3u16 = vmull_u8(d18u8, d0u8);
+ q4u16 = vmull_u8(d19u8, d0u8);
+ q5u16 = vmull_u8(d20u8, d0u8);
+ q6u16 = vmull_u8(d21u8, d0u8);
+
+ q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
+ q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
+ q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
+ q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
+
+ q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
+ q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
+ q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
+ q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
+
+ q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
+ q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
+ q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
+ q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
+
+ q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
+ q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
+ q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
+ q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
+ // Tap 3 is multiplied separately and merged below with a saturating add.
+ q7u16 = vmull_u8(d21u8, d3u8);
+ q8u16 = vmull_u8(d22u8, d3u8);
+ q9u16 = vmull_u8(d23u8, d3u8);
+ q10u16 = vmull_u8(d24u8, d3u8);
+
+ q3s16 = vreinterpretq_s16_u16(q3u16);
+ q4s16 = vreinterpretq_s16_u16(q4u16);
+ q5s16 = vreinterpretq_s16_u16(q5u16);
+ q6s16 = vreinterpretq_s16_u16(q6u16);
+ q7s16 = vreinterpretq_s16_u16(q7u16);
+ q8s16 = vreinterpretq_s16_u16(q8u16);
+ q9s16 = vreinterpretq_s16_u16(q9u16);
+ q10s16 = vreinterpretq_s16_u16(q10u16);
+
+ q7s16 = vqaddq_s16(q7s16, q3s16);
+ q8s16 = vqaddq_s16(q8s16, q4s16);
+ q9s16 = vqaddq_s16(q9s16, q5s16);
+ q10s16 = vqaddq_s16(q10s16, q6s16);
+ // Round, shift right by 7 and saturate to unsigned 8 bits.
+ d6u8 = vqrshrun_n_s16(q7s16, 7);
+ d7u8 = vqrshrun_n_s16(q8s16, 7);
+ d8u8 = vqrshrun_n_s16(q9s16, 7);
+ d9u8 = vqrshrun_n_s16(q10s16, 7);
+ // Slide the row window down by 4 rows for the next iteration.
+ d18u8 = d22u8;
+ d19u8 = d23u8;
+ d20u8 = d24u8;
+ d21u8 = d25u8;
+ d22u8 = d26u8;
+ d23u8 = d27u8;
+ d24u8 = d28u8;
+ d25u8 = d29u8;
+ d26u8 = d30u8;
+
+ vst1_u8(dst_ptr, d6u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d7u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d8u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d9u8);
+ dst_ptr += dst_pitch;
+ }
+ return;
+ }
+
+ // load first_pass filter: broadcast each tap, then take its absolute value
+ dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
+ d0s8 = vdup_lane_s8(dtmps8, 0);
+ d1s8 = vdup_lane_s8(dtmps8, 1);
+ d2s8 = vdup_lane_s8(dtmps8, 2);
+ d3s8 = vdup_lane_s8(dtmps8, 3);
+ d4s8 = vdup_lane_s8(dtmps8, 4);
+ d5s8 = vdup_lane_s8(dtmps8, 5);
+ d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+ d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+ d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+ d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+ d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+ d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+ // First pass: 8 rows x 8 columns when yoffset == 0, otherwise 13 rows x 8 columns
+ if (yoffset == 0) // horizontal filter only: start on the row at src_ptr
+ src = src_ptr - 2;
+ else // two passes: start two rows above src_ptr for the vertical taps
+ src = src_ptr - 2 - (src_pixels_per_line * 2);
+
+ tmpp = tmp;
+ for (i = 2; i > 0; i--) { // 4 rows per iteration
+ q3u8 = vld1q_u8(src); // 16 bytes are loaded per row; lanes 0..12 are used below
+ src += src_pixels_per_line;
+ q4u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q5u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q6u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+
+ __builtin_prefetch(src);
+ __builtin_prefetch(src + src_pixels_per_line);
+ __builtin_prefetch(src + src_pixels_per_line * 2);
+ // Tap 0 uses bytes 0..7 of each row; each vext below selects bytes k..k+7 for tap k.
+ q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
+ q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
+ q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
+ q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
+
+ d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
+ d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
+ d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
+ d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
+
+ q7u16 = vmlsl_u8(q7u16, d28u8, d1u8);
+ q8u16 = vmlsl_u8(q8u16, d29u8, d1u8);
+ q9u16 = vmlsl_u8(q9u16, d30u8, d1u8);
+ q10u16 = vmlsl_u8(q10u16, d31u8, d1u8);
+
+ d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
+ d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
+ d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
+ d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
+
+ q7u16 = vmlsl_u8(q7u16, d28u8, d4u8);
+ q8u16 = vmlsl_u8(q8u16, d29u8, d4u8);
+ q9u16 = vmlsl_u8(q9u16, d30u8, d4u8);
+ q10u16 = vmlsl_u8(q10u16, d31u8, d4u8);
+
+ d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
+ d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
+ d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
+ d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
+
+ q7u16 = vmlal_u8(q7u16, d28u8, d2u8);
+ q8u16 = vmlal_u8(q8u16, d29u8, d2u8);
+ q9u16 = vmlal_u8(q9u16, d30u8, d2u8);
+ q10u16 = vmlal_u8(q10u16, d31u8, d2u8);
+
+ d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
+ d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
+ d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
+ d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
+
+ q7u16 = vmlal_u8(q7u16, d28u8, d5u8);
+ q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
+ q9u16 = vmlal_u8(q9u16, d30u8, d5u8);
+ q10u16 = vmlal_u8(q10u16, d31u8, d5u8);
+
+ d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
+ d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
+ d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
+ d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
+ // Tap 3 product is kept separate (q3..q6) and merged with a saturating add.
+ q3u16 = vmull_u8(d28u8, d3u8);
+ q4u16 = vmull_u8(d29u8, d3u8);
+ q5u16 = vmull_u8(d30u8, d3u8);
+ q6u16 = vmull_u8(d31u8, d3u8);
+
+ q3s16 = vreinterpretq_s16_u16(q3u16);
+ q4s16 = vreinterpretq_s16_u16(q4u16);
+ q5s16 = vreinterpretq_s16_u16(q5u16);
+ q6s16 = vreinterpretq_s16_u16(q6u16);
+ q7s16 = vreinterpretq_s16_u16(q7u16);
+ q8s16 = vreinterpretq_s16_u16(q8u16);
+ q9s16 = vreinterpretq_s16_u16(q9u16);
+ q10s16 = vreinterpretq_s16_u16(q10u16);
+
+ q7s16 = vqaddq_s16(q7s16, q3s16);
+ q8s16 = vqaddq_s16(q8s16, q4s16);
+ q9s16 = vqaddq_s16(q9s16, q5s16);
+ q10s16 = vqaddq_s16(q10s16, q6s16);
+
+ d22u8 = vqrshrun_n_s16(q7s16, 7);
+ d23u8 = vqrshrun_n_s16(q8s16, 7);
+ d24u8 = vqrshrun_n_s16(q9s16, 7);
+ d25u8 = vqrshrun_n_s16(q10s16, 7);
+
+ if (yoffset == 0) { // horizontal filter only: rows go straight to dst
+ vst1_u8(dst_ptr, d22u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d23u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d24u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d25u8);
+ dst_ptr += dst_pitch;
+ } else { // keep the rows in tmp for the second pass
+ vst1_u8(tmpp, d22u8);
+ tmpp += 8;
+ vst1_u8(tmpp, d23u8);
+ tmpp += 8;
+ vst1_u8(tmpp, d24u8);
+ tmpp += 8;
+ vst1_u8(tmpp, d25u8);
+ tmpp += 8;
+ }
+ }
+ if (yoffset == 0) return;
+
+ // First pass on the remaining 5 rows; the results stay in d26..d30
+ q3u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q4u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q5u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q6u8 = vld1q_u8(src);
+ src += src_pixels_per_line;
+ q7u8 = vld1q_u8(src);
+
+ q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
+ q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
+ q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
+ q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
+ q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8);
+
+ d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
+ d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
+ d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
+ d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
+ d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1);
+
+ q8u16 = vmlsl_u8(q8u16, d27u8, d1u8);
+ q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
+ q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
+ q11u16 = vmlsl_u8(q11u16, d30u8, d1u8);
+ q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);
+
+ d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
+ d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
+ d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
+ d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
+ d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4);
+
+ q8u16 = vmlsl_u8(q8u16, d27u8, d4u8);
+ q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
+ q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
+ q11u16 = vmlsl_u8(q11u16, d30u8, d4u8);
+ q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);
+
+ d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
+ d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
+ d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
+ d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
+ d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2);
+
+ q8u16 = vmlal_u8(q8u16, d27u8, d2u8);
+ q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
+ q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
+ q11u16 = vmlal_u8(q11u16, d30u8, d2u8);
+ q12u16 = vmlal_u8(q12u16, d31u8, d2u8);
+
+ d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
+ d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
+ d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
+ d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
+ d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5);
+
+ q8u16 = vmlal_u8(q8u16, d27u8, d5u8);
+ q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
+ q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
+ q11u16 = vmlal_u8(q11u16, d30u8, d5u8);
+ q12u16 = vmlal_u8(q12u16, d31u8, d5u8);
+
+ d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
+ d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
+ d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
+ d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
+ d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3);
+
+ q3u16 = vmull_u8(d27u8, d3u8);
+ q4u16 = vmull_u8(d28u8, d3u8);
+ q5u16 = vmull_u8(d29u8, d3u8);
+ q6u16 = vmull_u8(d30u8, d3u8);
+ q7u16 = vmull_u8(d31u8, d3u8);
+
+ q3s16 = vreinterpretq_s16_u16(q3u16);
+ q4s16 = vreinterpretq_s16_u16(q4u16);
+ q5s16 = vreinterpretq_s16_u16(q5u16);
+ q6s16 = vreinterpretq_s16_u16(q6u16);
+ q7s16 = vreinterpretq_s16_u16(q7u16);
+ q8s16 = vreinterpretq_s16_u16(q8u16);
+ q9s16 = vreinterpretq_s16_u16(q9u16);
+ q10s16 = vreinterpretq_s16_u16(q10u16);
+ q11s16 = vreinterpretq_s16_u16(q11u16);
+ q12s16 = vreinterpretq_s16_u16(q12u16);
+
+ q8s16 = vqaddq_s16(q8s16, q3s16);
+ q9s16 = vqaddq_s16(q9s16, q4s16);
+ q10s16 = vqaddq_s16(q10s16, q5s16);
+ q11s16 = vqaddq_s16(q11s16, q6s16);
+ q12s16 = vqaddq_s16(q12s16, q7s16);
+
+ d26u8 = vqrshrun_n_s16(q8s16, 7);
+ d27u8 = vqrshrun_n_s16(q9s16, 7);
+ d28u8 = vqrshrun_n_s16(q10s16, 7);
+ d29u8 = vqrshrun_n_s16(q11s16, 7);
+ d30u8 = vqrshrun_n_s16(q12s16, 7);
+
+ // Second pass: 8x8 (vertical filter over the 13 first-pass rows)
+ dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
+ d0s8 = vdup_lane_s8(dtmps8, 0);
+ d1s8 = vdup_lane_s8(dtmps8, 1);
+ d2s8 = vdup_lane_s8(dtmps8, 2);
+ d3s8 = vdup_lane_s8(dtmps8, 3);
+ d4s8 = vdup_lane_s8(dtmps8, 4);
+ d5s8 = vdup_lane_s8(dtmps8, 5);
+ d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+ d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+ d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+ d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+ d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+ d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+ // Reload the 8 stored first-pass rows into d18..d25 (two rows per 16-byte load).
+ tmpp = tmp;
+ q9u8 = vld1q_u8(tmpp);
+ tmpp += 16;
+ q10u8 = vld1q_u8(tmpp);
+ tmpp += 16;
+ q11u8 = vld1q_u8(tmpp);
+ tmpp += 16;
+ q12u8 = vld1q_u8(tmpp);
+
+ d18u8 = vget_low_u8(q9u8);
+ d19u8 = vget_high_u8(q9u8);
+ d20u8 = vget_low_u8(q10u8);
+ d21u8 = vget_high_u8(q10u8);
+ d22u8 = vget_low_u8(q11u8);
+ d23u8 = vget_high_u8(q11u8);
+ d24u8 = vget_low_u8(q12u8);
+ d25u8 = vget_high_u8(q12u8);
+ // Each iteration filters 4 output rows from the 9 rows held in d18..d26.
+ for (i = 2; i > 0; i--) {
+ q3u16 = vmull_u8(d18u8, d0u8);
+ q4u16 = vmull_u8(d19u8, d0u8);
+ q5u16 = vmull_u8(d20u8, d0u8);
+ q6u16 = vmull_u8(d21u8, d0u8);
+
+ q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
+ q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
+ q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
+ q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
+
+ q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
+ q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
+ q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
+ q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
+
+ q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
+ q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
+ q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
+ q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
+
+ q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
+ q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
+ q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
+ q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
+
+ q7u16 = vmull_u8(d21u8, d3u8);
+ q8u16 = vmull_u8(d22u8, d3u8);
+ q9u16 = vmull_u8(d23u8, d3u8);
+ q10u16 = vmull_u8(d24u8, d3u8);
+
+ q3s16 = vreinterpretq_s16_u16(q3u16);
+ q4s16 = vreinterpretq_s16_u16(q4u16);
+ q5s16 = vreinterpretq_s16_u16(q5u16);
+ q6s16 = vreinterpretq_s16_u16(q6u16);
+ q7s16 = vreinterpretq_s16_u16(q7u16);
+ q8s16 = vreinterpretq_s16_u16(q8u16);
+ q9s16 = vreinterpretq_s16_u16(q9u16);
+ q10s16 = vreinterpretq_s16_u16(q10u16);
+
+ q7s16 = vqaddq_s16(q7s16, q3s16);
+ q8s16 = vqaddq_s16(q8s16, q4s16);
+ q9s16 = vqaddq_s16(q9s16, q5s16);
+ q10s16 = vqaddq_s16(q10s16, q6s16);
+
+ d6u8 = vqrshrun_n_s16(q7s16, 7);
+ d7u8 = vqrshrun_n_s16(q8s16, 7);
+ d8u8 = vqrshrun_n_s16(q9s16, 7);
+ d9u8 = vqrshrun_n_s16(q10s16, 7);
+ // Slide the row window down by 4 rows; d26..d30 hold the last 5 first-pass rows.
+ d18u8 = d22u8;
+ d19u8 = d23u8;
+ d20u8 = d24u8;
+ d21u8 = d25u8;
+ d22u8 = d26u8;
+ d23u8 = d27u8;
+ d24u8 = d28u8;
+ d25u8 = d29u8;
+ d26u8 = d30u8;
+
+ vst1_u8(dst_ptr, d6u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d7u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d8u8);
+ dst_ptr += dst_pitch;
+ vst1_u8(dst_ptr, d9u8);
+ dst_ptr += dst_pitch;
+ }
+}
+/*
+ * Six-tap sub-pixel prediction of one 16x16 block (NEON intrinsics).
+ *
+ * Filter taps are read from vp8_sub_pel_filters[xoffset] for the horizontal
+ * pass and vp8_sub_pel_filters[yoffset] for the vertical pass. Both passes
+ * work on the absolute values of the six taps: products for taps 0, 2, 3 and
+ * 5 are added, products for taps 1 and 4 are subtracted, and the sum is
+ * narrowed with a rounding, saturating shift right by 7.
+ * NOTE(review): this presumably relies on taps 1 and 4 being <= 0 and the
+ * other taps being >= 0 in vp8_sub_pel_filters (defined elsewhere) - confirm.
+ *
+ * src_ptr, src_pixels_per_line: source pointer and row stride. Reads start
+ *   two rows above src_ptr when the vertical filter runs and two pixels to
+ *   its left when the horizontal filter runs.
+ * xoffset, yoffset: row indices into vp8_sub_pel_filters.
+ * dst_ptr, dst_pitch: destination and row stride; 16 rows of 16 bytes are
+ *   written.
+ *
+ * Paths:
+ *   xoffset == 0: vertical filter only; the block is processed as two
+ *                 8-column halves, each reading 21 source rows.
+ *   yoffset == 0: horizontal filter only; two rows per iteration are written
+ *                 straight to dst.
+ *   otherwise:    horizontal filter over 21 rows into tmp (16 bytes per
+ *                 row), then the vertical filter over tmp.
+ */
+void vp8_sixtap_predict16x16_neon(unsigned char *src_ptr,
+ int src_pixels_per_line, int xoffset,
+ int yoffset, unsigned char *dst_ptr,
+ int dst_pitch) {
+ unsigned char *src, *src_tmp, *dst, *tmpp;
+ unsigned char tmp[336]; // first-pass output: 21 rows of 16 bytes
+ int i, j;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
+ uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d18u8, d19u8;
+ uint8x8_t d20u8, d21u8, d22u8, d23u8, d24u8, d25u8, d26u8, d27u8;
+ uint8x8_t d28u8, d29u8, d30u8, d31u8;
+ int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
+ uint8x16_t q3u8, q4u8;
+ uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16, q8u16, q9u16, q10u16;
+ uint16x8_t q11u16, q12u16, q13u16, q15u16;
+ int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16, q8s16, q9s16, q10s16;
+ int16x8_t q11s16, q12s16, q13s16, q15s16;
+ // Vertical-only path: source rows are filtered straight into dst.
+ if (xoffset == 0) { // secondpass_filter16x16_only
+ // load second_pass filter: broadcast each tap, then take its absolute value
+ dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
+ d0s8 = vdup_lane_s8(dtmps8, 0);
+ d1s8 = vdup_lane_s8(dtmps8, 1);
+ d2s8 = vdup_lane_s8(dtmps8, 2);
+ d3s8 = vdup_lane_s8(dtmps8, 3);
+ d4s8 = vdup_lane_s8(dtmps8, 4);
+ d5s8 = vdup_lane_s8(dtmps8, 5);
+ d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+ d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+ d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+ d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+ d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+ d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+ // load src data, starting two rows above src_ptr; one 8-column half per outer iteration
+ src_tmp = src_ptr - src_pixels_per_line * 2;
+ for (i = 0; i < 2; ++i) {
+ src = src_tmp + i * 8;
+ dst = dst_ptr + i * 8;
+ d18u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d19u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d20u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d21u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d22u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ for (j = 0; j < 4; ++j) { // 4 new source rows in, 4 output rows out
+ d23u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d24u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d25u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+ d26u8 = vld1_u8(src);
+ src += src_pixels_per_line;
+
+ q3u16 = vmull_u8(d18u8, d0u8);
+ q4u16 = vmull_u8(d19u8, d0u8);
+ q5u16 = vmull_u8(d20u8, d0u8);
+ q6u16 = vmull_u8(d21u8, d0u8);
+
+ q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
+ q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
+ q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
+ q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
+
+ q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
+ q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
+ q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
+ q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
+
+ q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
+ q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
+ q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
+ q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
+
+ q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
+ q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
+ q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
+ q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
+ // Tap 3 is multiplied separately and merged below with a saturating add.
+ q7u16 = vmull_u8(d21u8, d3u8);
+ q8u16 = vmull_u8(d22u8, d3u8);
+ q9u16 = vmull_u8(d23u8, d3u8);
+ q10u16 = vmull_u8(d24u8, d3u8);
+
+ q3s16 = vreinterpretq_s16_u16(q3u16);
+ q4s16 = vreinterpretq_s16_u16(q4u16);
+ q5s16 = vreinterpretq_s16_u16(q5u16);
+ q6s16 = vreinterpretq_s16_u16(q6u16);
+ q7s16 = vreinterpretq_s16_u16(q7u16);
+ q8s16 = vreinterpretq_s16_u16(q8u16);
+ q9s16 = vreinterpretq_s16_u16(q9u16);
+ q10s16 = vreinterpretq_s16_u16(q10u16);
+
+ q7s16 = vqaddq_s16(q7s16, q3s16);
+ q8s16 = vqaddq_s16(q8s16, q4s16);
+ q9s16 = vqaddq_s16(q9s16, q5s16);
+ q10s16 = vqaddq_s16(q10s16, q6s16);
+ // Round, shift right by 7 and saturate to unsigned 8 bits.
+ d6u8 = vqrshrun_n_s16(q7s16, 7);
+ d7u8 = vqrshrun_n_s16(q8s16, 7);
+ d8u8 = vqrshrun_n_s16(q9s16, 7);
+ d9u8 = vqrshrun_n_s16(q10s16, 7);
+ // Keep the last 5 rows as context for the next 4 output rows.
+ d18u8 = d22u8;
+ d19u8 = d23u8;
+ d20u8 = d24u8;
+ d21u8 = d25u8;
+ d22u8 = d26u8;
+
+ vst1_u8(dst, d6u8);
+ dst += dst_pitch;
+ vst1_u8(dst, d7u8);
+ dst += dst_pitch;
+ vst1_u8(dst, d8u8);
+ dst += dst_pitch;
+ vst1_u8(dst, d9u8);
+ dst += dst_pitch;
+ }
+ }
+ return;
+ }
+
+ // load first_pass filter: broadcast each tap, then take its absolute value
+ dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
+ d0s8 = vdup_lane_s8(dtmps8, 0);
+ d1s8 = vdup_lane_s8(dtmps8, 1);
+ d2s8 = vdup_lane_s8(dtmps8, 2);
+ d3s8 = vdup_lane_s8(dtmps8, 3);
+ d4s8 = vdup_lane_s8(dtmps8, 4);
+ d5s8 = vdup_lane_s8(dtmps8, 5);
+ d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+ d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+ d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+ d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+ d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+ d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+ // First pass: 16 rows x 16 columns when yoffset == 0, otherwise 21 rows x 16 columns
+ if (yoffset == 0) { // firstpass_filter16x16_only
+ src = src_ptr - 2;
+ dst = dst_ptr;
+ for (i = 0; i < 8; ++i) { // two rows per iteration
+ d6u8 = vld1_u8(src);
+ d7u8 = vld1_u8(src + 8);
+ d8u8 = vld1_u8(src + 16); // NOTE(review): 8 bytes loaded, only lanes 0..4 used below; possible over-read - confirm
+ src += src_pixels_per_line;
+ d9u8 = vld1_u8(src);
+ d10u8 = vld1_u8(src + 8);
+ d11u8 = vld1_u8(src + 16);
+ src += src_pixels_per_line;
+
+ __builtin_prefetch(src);
+ __builtin_prefetch(src + src_pixels_per_line);
+ // q6/q7: left/right 8 columns of the first row; q8/q9: of the second row.
+ q6u16 = vmull_u8(d6u8, d0u8);
+ q7u16 = vmull_u8(d7u8, d0u8);
+ q8u16 = vmull_u8(d9u8, d0u8);
+ q9u16 = vmull_u8(d10u8, d0u8);
+
+ d20u8 = vext_u8(d6u8, d7u8, 1);
+ d21u8 = vext_u8(d9u8, d10u8, 1);
+ d22u8 = vext_u8(d7u8, d8u8, 1);
+ d23u8 = vext_u8(d10u8, d11u8, 1);
+ d24u8 = vext_u8(d6u8, d7u8, 4);
+ d25u8 = vext_u8(d9u8, d10u8, 4);
+ d26u8 = vext_u8(d7u8, d8u8, 4);
+ d27u8 = vext_u8(d10u8, d11u8, 4);
+ d28u8 = vext_u8(d6u8, d7u8, 5);
+ d29u8 = vext_u8(d9u8, d10u8, 5);
+
+ q6u16 = vmlsl_u8(q6u16, d20u8, d1u8);
+ q8u16 = vmlsl_u8(q8u16, d21u8, d1u8);
+ q7u16 = vmlsl_u8(q7u16, d22u8, d1u8);
+ q9u16 = vmlsl_u8(q9u16, d23u8, d1u8);
+ q6u16 = vmlsl_u8(q6u16, d24u8, d4u8);
+ q8u16 = vmlsl_u8(q8u16, d25u8, d4u8);
+ q7u16 = vmlsl_u8(q7u16, d26u8, d4u8);
+ q9u16 = vmlsl_u8(q9u16, d27u8, d4u8);
+ q6u16 = vmlal_u8(q6u16, d28u8, d5u8);
+ q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
+
+ d20u8 = vext_u8(d7u8, d8u8, 5);
+ d21u8 = vext_u8(d10u8, d11u8, 5);
+ d22u8 = vext_u8(d6u8, d7u8, 2);
+ d23u8 = vext_u8(d9u8, d10u8, 2);
+ d24u8 = vext_u8(d7u8, d8u8, 2);
+ d25u8 = vext_u8(d10u8, d11u8, 2);
+ d26u8 = vext_u8(d6u8, d7u8, 3);
+ d27u8 = vext_u8(d9u8, d10u8, 3);
+ d28u8 = vext_u8(d7u8, d8u8, 3);
+ d29u8 = vext_u8(d10u8, d11u8, 3);
+
+ q7u16 = vmlal_u8(q7u16, d20u8, d5u8);
+ q9u16 = vmlal_u8(q9u16, d21u8, d5u8);
+ q6u16 = vmlal_u8(q6u16, d22u8, d2u8);
+ q8u16 = vmlal_u8(q8u16, d23u8, d2u8);
+ q7u16 = vmlal_u8(q7u16, d24u8, d2u8);
+ q9u16 = vmlal_u8(q9u16, d25u8, d2u8);
+ // Tap 3 products are kept separate and merged with a saturating add.
+ q10u16 = vmull_u8(d26u8, d3u8);
+ q11u16 = vmull_u8(d27u8, d3u8);
+ q12u16 = vmull_u8(d28u8, d3u8);
+ q15u16 = vmull_u8(d29u8, d3u8);
+
+ q6s16 = vreinterpretq_s16_u16(q6u16);
+ q7s16 = vreinterpretq_s16_u16(q7u16);
+ q8s16 = vreinterpretq_s16_u16(q8u16);
+ q9s16 = vreinterpretq_s16_u16(q9u16);
+ q10s16 = vreinterpretq_s16_u16(q10u16);
+ q11s16 = vreinterpretq_s16_u16(q11u16);
+ q12s16 = vreinterpretq_s16_u16(q12u16);
+ q15s16 = vreinterpretq_s16_u16(q15u16);
+
+ q6s16 = vqaddq_s16(q6s16, q10s16);
+ q8s16 = vqaddq_s16(q8s16, q11s16);
+ q7s16 = vqaddq_s16(q7s16, q12s16);
+ q9s16 = vqaddq_s16(q9s16, q15s16);
+
+ d6u8 = vqrshrun_n_s16(q6s16, 7);
+ d7u8 = vqrshrun_n_s16(q7s16, 7);
+ d8u8 = vqrshrun_n_s16(q8s16, 7);
+ d9u8 = vqrshrun_n_s16(q9s16, 7);
+ // Join the two halves of each row and store 16 bytes per row.
+ q3u8 = vcombine_u8(d6u8, d7u8);
+ q4u8 = vcombine_u8(d8u8, d9u8);
+ vst1q_u8(dst, q3u8);
+ dst += dst_pitch;
+ vst1q_u8(dst, q4u8);
+ dst += dst_pitch;
+ }
+ return;
+ }
+ // Two-pass path: horizontal filter over 21 rows (3 per iteration) into tmp.
+ src = src_ptr - 2 - src_pixels_per_line * 2;
+ tmpp = tmp;
+ for (i = 0; i < 7; ++i) {
+ d6u8 = vld1_u8(src);
+ d7u8 = vld1_u8(src + 8);
+ d8u8 = vld1_u8(src + 16);
+ src += src_pixels_per_line;
+ d9u8 = vld1_u8(src);
+ d10u8 = vld1_u8(src + 8);
+ d11u8 = vld1_u8(src + 16);
+ src += src_pixels_per_line;
+ d12u8 = vld1_u8(src);
+ d13u8 = vld1_u8(src + 8);
+ // Only 5 pixels are needed, avoid a potential out of bounds read.
+ d14u8 = vld1_u8(src + 13);
+ d14u8 = vext_u8(d14u8, d14u8, 3); // rotate so that src[16..20] sit in lanes 0..4
+ src += src_pixels_per_line;
+
+ __builtin_prefetch(src);
+ __builtin_prefetch(src + src_pixels_per_line);
+ __builtin_prefetch(src + src_pixels_per_line * 2);
+ // q8/q9, q10/q11, q12/q13: left/right 8 columns of the three rows.
+ q8u16 = vmull_u8(d6u8, d0u8);
+ q9u16 = vmull_u8(d7u8, d0u8);
+ q10u16 = vmull_u8(d9u8, d0u8);
+ q11u16 = vmull_u8(d10u8, d0u8);
+ q12u16 = vmull_u8(d12u8, d0u8);
+ q13u16 = vmull_u8(d13u8, d0u8);
+
+ d28u8 = vext_u8(d6u8, d7u8, 1);
+ d29u8 = vext_u8(d9u8, d10u8, 1);
+ d30u8 = vext_u8(d12u8, d13u8, 1);
+ q8u16 = vmlsl_u8(q8u16, d28u8, d1u8);
+ q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
+ q12u16 = vmlsl_u8(q12u16, d30u8, d1u8);
+ d28u8 = vext_u8(d7u8, d8u8, 1);
+ d29u8 = vext_u8(d10u8, d11u8, 1);
+ d30u8 = vext_u8(d13u8, d14u8, 1);
+ q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
+ q11u16 = vmlsl_u8(q11u16, d29u8, d1u8);
+ q13u16 = vmlsl_u8(q13u16, d30u8, d1u8);
+
+ d28u8 = vext_u8(d6u8, d7u8, 4);
+ d29u8 = vext_u8(d9u8, d10u8, 4);
+ d30u8 = vext_u8(d12u8, d13u8, 4);
+ q8u16 = vmlsl_u8(q8u16, d28u8, d4u8);
+ q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
+ q12u16 = vmlsl_u8(q12u16, d30u8, d4u8);
+ d28u8 = vext_u8(d7u8, d8u8, 4);
+ d29u8 = vext_u8(d10u8, d11u8, 4);
+ d30u8 = vext_u8(d13u8, d14u8, 4);
+ q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
+ q11u16 = vmlsl_u8(q11u16, d29u8, d4u8);
+ q13u16 = vmlsl_u8(q13u16, d30u8, d4u8);
+
+ d28u8 = vext_u8(d6u8, d7u8, 5);
+ d29u8 = vext_u8(d9u8, d10u8, 5);
+ d30u8 = vext_u8(d12u8, d13u8, 5);
+ q8u16 = vmlal_u8(q8u16, d28u8, d5u8);
+ q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
+ q12u16 = vmlal_u8(q12u16, d30u8, d5u8);
+ d28u8 = vext_u8(d7u8, d8u8, 5);
+ d29u8 = vext_u8(d10u8, d11u8, 5);
+ d30u8 = vext_u8(d13u8, d14u8, 5);
+ q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
+ q11u16 = vmlal_u8(q11u16, d29u8, d5u8);
+ q13u16 = vmlal_u8(q13u16, d30u8, d5u8);
+
+ d28u8 = vext_u8(d6u8, d7u8, 2);
+ d29u8 = vext_u8(d9u8, d10u8, 2);
+ d30u8 = vext_u8(d12u8, d13u8, 2);
+ q8u16 = vmlal_u8(q8u16, d28u8, d2u8);
+ q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
+ q12u16 = vmlal_u8(q12u16, d30u8, d2u8);
+ d28u8 = vext_u8(d7u8, d8u8, 2);
+ d29u8 = vext_u8(d10u8, d11u8, 2);
+ d30u8 = vext_u8(d13u8, d14u8, 2);
+ q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
+ q11u16 = vmlal_u8(q11u16, d29u8, d2u8);
+ q13u16 = vmlal_u8(q13u16, d30u8, d2u8);
+ // Tap 3 products are kept separate and merged with a saturating add.
+ d28u8 = vext_u8(d6u8, d7u8, 3);
+ d29u8 = vext_u8(d9u8, d10u8, 3);
+ d30u8 = vext_u8(d12u8, d13u8, 3);
+ d15u8 = vext_u8(d7u8, d8u8, 3);
+ d31u8 = vext_u8(d10u8, d11u8, 3);
+ d6u8 = vext_u8(d13u8, d14u8, 3);
+ q4u16 = vmull_u8(d28u8, d3u8);
+ q5u16 = vmull_u8(d29u8, d3u8);
+ q6u16 = vmull_u8(d30u8, d3u8);
+ q4s16 = vreinterpretq_s16_u16(q4u16);
+ q5s16 = vreinterpretq_s16_u16(q5u16);
+ q6s16 = vreinterpretq_s16_u16(q6u16);
+ q8s16 = vreinterpretq_s16_u16(q8u16);
+ q10s16 = vreinterpretq_s16_u16(q10u16);
+ q12s16 = vreinterpretq_s16_u16(q12u16);
+ q8s16 = vqaddq_s16(q8s16, q4s16);
+ q10s16 = vqaddq_s16(q10s16, q5s16);
+ q12s16 = vqaddq_s16(q12s16, q6s16);
+
+ q6u16 = vmull_u8(d15u8, d3u8);
+ q7u16 = vmull_u8(d31u8, d3u8);
+ q3u16 = vmull_u8(d6u8, d3u8);
+ q3s16 = vreinterpretq_s16_u16(q3u16);
+ q6s16 = vreinterpretq_s16_u16(q6u16);
+ q7s16 = vreinterpretq_s16_u16(q7u16);
+ q9s16 = vreinterpretq_s16_u16(q9u16);
+ q11s16 = vreinterpretq_s16_u16(q11u16);
+ q13s16 = vreinterpretq_s16_u16(q13u16);
+ q9s16 = vqaddq_s16(q9s16, q6s16);
+ q11s16 = vqaddq_s16(q11s16, q7s16);
+ q13s16 = vqaddq_s16(q13s16, q3s16);
+
+ d6u8 = vqrshrun_n_s16(q8s16, 7);
+ d7u8 = vqrshrun_n_s16(q9s16, 7);
+ d8u8 = vqrshrun_n_s16(q10s16, 7);
+ d9u8 = vqrshrun_n_s16(q11s16, 7);
+ d10u8 = vqrshrun_n_s16(q12s16, 7);
+ d11u8 = vqrshrun_n_s16(q13s16, 7);
+ // Store the three filtered rows contiguously: left half then right half of each row.
+ vst1_u8(tmpp, d6u8);
+ tmpp += 8;
+ vst1_u8(tmpp, d7u8);
+ tmpp += 8;
+ vst1_u8(tmpp, d8u8);
+ tmpp += 8;
+ vst1_u8(tmpp, d9u8);
+ tmpp += 8;
+ vst1_u8(tmpp, d10u8);
+ tmpp += 8;
+ vst1_u8(tmpp, d11u8);
+ tmpp += 8;
+ }
+
+ // Second pass: 16x16 (vertical filter over the 21 rows in tmp)
+ dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
+ d0s8 = vdup_lane_s8(dtmps8, 0);
+ d1s8 = vdup_lane_s8(dtmps8, 1);
+ d2s8 = vdup_lane_s8(dtmps8, 2);
+ d3s8 = vdup_lane_s8(dtmps8, 3);
+ d4s8 = vdup_lane_s8(dtmps8, 4);
+ d5s8 = vdup_lane_s8(dtmps8, 5);
+ d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+ d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+ d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+ d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+ d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+ d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+ // One 8-column half per outer iteration; rows in tmp are 16 bytes apart.
+ for (i = 0; i < 2; ++i) {
+ dst = dst_ptr + 8 * i;
+ tmpp = tmp + 8 * i;
+ d18u8 = vld1_u8(tmpp);
+ tmpp += 16;
+ d19u8 = vld1_u8(tmpp);
+ tmpp += 16;
+ d20u8 = vld1_u8(tmpp);
+ tmpp += 16;
+ d21u8 = vld1_u8(tmpp);
+ tmpp += 16;
+ d22u8 = vld1_u8(tmpp);
+ tmpp += 16;
+ for (j = 0; j < 4; ++j) { // 4 new rows in, 4 output rows out
+ d23u8 = vld1_u8(tmpp);
+ tmpp += 16;
+ d24u8 = vld1_u8(tmpp);
+ tmpp += 16;
+ d25u8 = vld1_u8(tmpp);
+ tmpp += 16;
+ d26u8 = vld1_u8(tmpp);
+ tmpp += 16;
+
+ q3u16 = vmull_u8(d18u8, d0u8);
+ q4u16 = vmull_u8(d19u8, d0u8);
+ q5u16 = vmull_u8(d20u8, d0u8);
+ q6u16 = vmull_u8(d21u8, d0u8);
+
+ q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
+ q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
+ q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
+ q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
+
+ q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
+ q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
+ q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
+ q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
+
+ q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
+ q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
+ q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
+ q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
+
+ q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
+ q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
+ q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
+ q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
+
+ q7u16 = vmull_u8(d21u8, d3u8);
+ q8u16 = vmull_u8(d22u8, d3u8);
+ q9u16 = vmull_u8(d23u8, d3u8);
+ q10u16 = vmull_u8(d24u8, d3u8);
+
+ q3s16 = vreinterpretq_s16_u16(q3u16);
+ q4s16 = vreinterpretq_s16_u16(q4u16);
+ q5s16 = vreinterpretq_s16_u16(q5u16);
+ q6s16 = vreinterpretq_s16_u16(q6u16);
+ q7s16 = vreinterpretq_s16_u16(q7u16);
+ q8s16 = vreinterpretq_s16_u16(q8u16);
+ q9s16 = vreinterpretq_s16_u16(q9u16);
+ q10s16 = vreinterpretq_s16_u16(q10u16);
+
+ q7s16 = vqaddq_s16(q7s16, q3s16);
+ q8s16 = vqaddq_s16(q8s16, q4s16);
+ q9s16 = vqaddq_s16(q9s16, q5s16);
+ q10s16 = vqaddq_s16(q10s16, q6s16);
+
+ d6u8 = vqrshrun_n_s16(q7s16, 7);
+ d7u8 = vqrshrun_n_s16(q8s16, 7);
+ d8u8 = vqrshrun_n_s16(q9s16, 7);
+ d9u8 = vqrshrun_n_s16(q10s16, 7);
+ // Keep the last 5 rows as context for the next 4 output rows.
+ d18u8 = d22u8;
+ d19u8 = d23u8;
+ d20u8 = d24u8;
+ d21u8 = d25u8;
+ d22u8 = d26u8;
+
+ vst1_u8(dst, d6u8);
+ dst += dst_pitch;
+ vst1_u8(dst, d7u8);
+ dst += dst_pitch;
+ vst1_u8(dst, d8u8);
+ dst += dst_pitch;
+ vst1_u8(dst, d9u8);
+ dst += dst_pitch;
+ }
+ }
+}
diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/vp8_loopfilter_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/vp8_loopfilter_neon.c
new file mode 100644
index 0000000000..ebc004a048
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/arm/neon/vp8_loopfilter_neon.c
@@ -0,0 +1,538 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "vp8/common/arm/loopfilter_arm.h"
+#include "vpx_ports/arm.h"
+
+static INLINE void vp8_loop_filter_neon(uint8x16_t qblimit, // blimit
+ uint8x16_t qlimit, // limit
+ uint8x16_t qthresh, // thresh
+ uint8x16_t q3, // p3
+ uint8x16_t q4, // p2
+ uint8x16_t q5, // p1
+ uint8x16_t q6, // p0
+ uint8x16_t q7, // q0
+ uint8x16_t q8, // q1
+ uint8x16_t q9, // q2
+ uint8x16_t q10, // q3
+ uint8x16_t *q5r, // out: filtered p1
+ uint8x16_t *q6r, // out: filtered p0
+ uint8x16_t *q7r, // out: filtered q0
+ uint8x16_t *q8r) { // out: filtered q1
+ uint8x16_t q0u8, q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8;
+ int16x8_t q2s16, q11s16;
+ uint16x8_t q4u16;
+ int8x16_t q1s8, q2s8, q10s8, q11s8, q12s8, q13s8;
+ int8x8_t d2s8, d3s8;
+
+ q11u8 = vabdq_u8(q3, q4); // |p3 - p2|
+ q12u8 = vabdq_u8(q4, q5); // |p2 - p1|
+ q13u8 = vabdq_u8(q5, q6); // |p1 - p0|
+ q14u8 = vabdq_u8(q8, q7); // |q1 - q0|
+ q3 = vabdq_u8(q9, q8); // |q2 - q1| (the p3 input is no longer needed)
+ q4 = vabdq_u8(q10, q9); // |q3 - q2| (the p2 input is no longer needed)
+
+ q11u8 = vmaxq_u8(q11u8, q12u8);
+ q12u8 = vmaxq_u8(q13u8, q14u8);
+ q3 = vmaxq_u8(q3, q4);
+ q15u8 = vmaxq_u8(q11u8, q12u8); // max of the four differences on the p side and q1/q0
+
+ q9 = vabdq_u8(q6, q7); // |p0 - q0|
+
+ // vp8_hevmask: lanes where |p1 - p0| or |q1 - q0| exceeds thresh
+ q13u8 = vcgtq_u8(q13u8, qthresh);
+ q14u8 = vcgtq_u8(q14u8, qthresh);
+ q15u8 = vmaxq_u8(q15u8, q3); // max of all six neighbouring differences
+
+ q2u8 = vabdq_u8(q5, q8); // |p1 - q1|
+ q9 = vqaddq_u8(q9, q9); // |p0 - q0| * 2, saturating
+
+ q15u8 = vcgeq_u8(qlimit, q15u8); // all-ones where limit >= max difference
+
+ // vp8_filter() function
+ // convert to signed by flipping the top bit (value - 128)
+ q10 = vdupq_n_u8(0x80);
+ q8 = veorq_u8(q8, q10); // qs1
+ q7 = veorq_u8(q7, q10); // qs0
+ q6 = veorq_u8(q6, q10); // ps0
+ q5 = veorq_u8(q5, q10); // ps1
+
+ q2u8 = vshrq_n_u8(q2u8, 1); // |p1 - q1| / 2
+ q9 = vqaddq_u8(q9, q2u8); // |p0 - q0| * 2 + |p1 - q1| / 2, saturating
+
+ q10 = vdupq_n_u8(3);
+
+ q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
+ vget_low_s8(vreinterpretq_s8_u8(q6))); // qs0 - ps0, widened (lanes 0-7)
+ q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
+ vget_high_s8(vreinterpretq_s8_u8(q6))); // qs0 - ps0, widened (lanes 8-15)
+
+ q9 = vcgeq_u8(qblimit, q9); // all-ones where blimit >= the sum above
+
+ q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5), vreinterpretq_s8_u8(q8)); // ps1 - qs1, saturating
+
+ q14u8 = vorrq_u8(q13u8, q14u8); // combined hev mask
+
+ q4u16 = vmovl_u8(vget_low_u8(q10)); // the constant 3 as 16-bit lanes
+ q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16)); // 3 * (qs0 - ps0)
+ q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16));
+
+ q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8); // keep ps1 - qs1 only in hev lanes
+ q15u8 = vandq_u8(q15u8, q9); // final filter mask: both limit tests passed
+
+ q1s8 = vreinterpretq_s8_u8(q1u8);
+ q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));
+ q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8));
+
+ q9 = vdupq_n_u8(4);
+ // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
+ d2s8 = vqmovn_s16(q2s16); // the saturating narrow performs the clamp
+ d3s8 = vqmovn_s16(q11s16);
+ q1s8 = vcombine_s8(d2s8, d3s8);
+ q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8); // zero the filter where the mask is clear
+ q1s8 = vreinterpretq_s8_u8(q1u8);
+
+ q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q10)); // filter + 3, saturating
+ q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9)); // filter + 4, saturating
+ q2s8 = vshrq_n_s8(q2s8, 3); // (filter + 3) >> 3
+ q1s8 = vshrq_n_s8(q1s8, 3); // (filter + 4) >> 3
+
+ q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8); // new ps0 = ps0 + ((filter + 3) >> 3)
+ q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8); // new qs0 = qs0 - ((filter + 4) >> 3)
+
+ q1s8 = vrshrq_n_s8(q1s8, 1); // rounding shift right by 1
+ q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8)); // outer taps stay unchanged in hev lanes
+
+ q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8); // new ps1
+ q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8); // new qs1
+
+ q0u8 = vdupq_n_u8(0x80); // flip the top bit again to return to unsigned
+ *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q0u8);
+ *q7r = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);
+ *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);
+ *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q0u8);
+ return;
+}
+
+void vp8_loop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch,
+ unsigned char blimit,
+ unsigned char limit,
+ unsigned char thresh) { // filters 16 columns across the edge just above row src
+ uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+ uint8x16_t q5, q6, q7, q8, q9, q10;
+
+ qblimit = vdupq_n_u8(blimit); // broadcast each scalar to all 16 lanes
+ qlimit = vdupq_n_u8(limit);
+ qthresh = vdupq_n_u8(thresh);
+ src -= (pitch << 2); // start 4 rows above src (the p3 row)
+
+ q3 = vld1q_u8(src); // p3
+ src += pitch;
+ q4 = vld1q_u8(src); // p2
+ src += pitch;
+ q5 = vld1q_u8(src); // p1
+ src += pitch;
+ q6 = vld1q_u8(src); // p0
+ src += pitch;
+ q7 = vld1q_u8(src); // q0 (the row src originally pointed at)
+ src += pitch;
+ q8 = vld1q_u8(src); // q1
+ src += pitch;
+ q9 = vld1q_u8(src); // q2
+ src += pitch;
+ q10 = vld1q_u8(src); // q3
+
+ vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4, q5, q6, q7, q8, q9,
+ q10, &q5, &q6, &q7, &q8); // results overwrite q5..q8 (p1, p0, q0, q1)
+
+ src -= (pitch * 5); // src is at the q3 row; step back to the p1 row
+ vst1q_u8(src, q5); // only p1, p0, q0 and q1 are written back
+ src += pitch;
+ vst1q_u8(src, q6);
+ src += pitch;
+ vst1q_u8(src, q7);
+ src += pitch;
+ vst1q_u8(src, q8);
+ return;
+}
+
+void vp8_loop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch,
+ unsigned char blimit,
+ unsigned char limit,
+ unsigned char thresh,
+ unsigned char *v) { // filters 8 U and 8 V columns in one 16-lane pass
+ uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+ uint8x16_t q5, q6, q7, q8, q9, q10;
+ uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
+ uint8x8_t d15, d16, d17, d18, d19, d20, d21;
+
+ qblimit = vdupq_n_u8(blimit); // broadcast each scalar to all 16 lanes
+ qlimit = vdupq_n_u8(limit);
+ qthresh = vdupq_n_u8(thresh);
+
+ u -= (pitch << 2); // start 4 rows above the given rows (the p3 rows)
+ v -= (pitch << 2);
+
+ d6 = vld1_u8(u); // even d registers hold U rows, odd ones hold V rows
+ u += pitch;
+ d7 = vld1_u8(v);
+ v += pitch;
+ d8 = vld1_u8(u);
+ u += pitch;
+ d9 = vld1_u8(v);
+ v += pitch;
+ d10 = vld1_u8(u);
+ u += pitch;
+ d11 = vld1_u8(v);
+ v += pitch;
+ d12 = vld1_u8(u);
+ u += pitch;
+ d13 = vld1_u8(v);
+ v += pitch;
+ d14 = vld1_u8(u);
+ u += pitch;
+ d15 = vld1_u8(v);
+ v += pitch;
+ d16 = vld1_u8(u);
+ u += pitch;
+ d17 = vld1_u8(v);
+ v += pitch;
+ d18 = vld1_u8(u);
+ u += pitch;
+ d19 = vld1_u8(v);
+ v += pitch;
+ d20 = vld1_u8(u);
+ d21 = vld1_u8(v);
+
+ q3 = vcombine_u8(d6, d7); // p3: U in the low half, V in the high half
+ q4 = vcombine_u8(d8, d9); // p2
+ q5 = vcombine_u8(d10, d11); // p1
+ q6 = vcombine_u8(d12, d13); // p0
+ q7 = vcombine_u8(d14, d15); // q0
+ q8 = vcombine_u8(d16, d17); // q1
+ q9 = vcombine_u8(d18, d19); // q2
+ q10 = vcombine_u8(d20, d21); // q3
+
+ vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4, q5, q6, q7, q8, q9,
+ q10, &q5, &q6, &q7, &q8); // results overwrite q5..q8 (p1, p0, q0, q1)
+
+ u -= (pitch * 5); // u is at the q3 row; step back to the p1 row
+ vst1_u8(u, vget_low_u8(q5)); // U results are the low halves
+ u += pitch;
+ vst1_u8(u, vget_low_u8(q6));
+ u += pitch;
+ vst1_u8(u, vget_low_u8(q7));
+ u += pitch;
+ vst1_u8(u, vget_low_u8(q8));
+
+ v -= (pitch * 5); // same rewind for V
+ vst1_u8(v, vget_high_u8(q5)); // V results are the high halves
+ v += pitch;
+ vst1_u8(v, vget_high_u8(q6));
+ v += pitch;
+ vst1_u8(v, vget_high_u8(q7));
+ v += pitch;
+ vst1_u8(v, vget_high_u8(q8));
+ return;
+}
+
+static INLINE void write_4x8(unsigned char *dst, int pitch,
+ const uint8x8x4_t result) { // stores 8 rows of 4 bytes: row r = lane r of val[0..3]
+#ifdef VPX_INCOMPATIBLE_GCC
+ /*
+ * uint8x8x4_t result
+ 00 01 02 03 | 04 05 06 07
+ 10 11 12 13 | 14 15 16 17
+ 20 21 22 23 | 24 25 26 27
+ 30 31 32 33 | 34 35 36 37
+ ---
+ * after vtrn_u16
+ 00 01 20 21 | 04 05 24 25
+ 02 03 22 23 | 06 07 26 27
+ 10 11 30 31 | 14 15 34 35
+ 12 13 32 33 | 16 17 36 37
+ ---
+ * after vtrn_u8
+ 00 10 20 30 | 04 14 24 34
+ 01 11 21 31 | 05 15 25 35
+ 02 12 22 32 | 06 16 26 36
+ 03 13 23 33 | 07 17 27 37
+ */
+ const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[0]),
+ vreinterpret_u16_u8(result.val[2]));
+ const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[1]),
+ vreinterpret_u16_u8(result.val[3]));
+ const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]),
+ vreinterpret_u8_u16(r13_u16.val[0]));
+ const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]),
+ vreinterpret_u8_u16(r13_u16.val[1]));
+ const uint32x2_t x_0_4 = vreinterpret_u32_u8(r01_u8.val[0]); // rows 0 and 4
+ const uint32x2_t x_1_5 = vreinterpret_u32_u8(r01_u8.val[1]); // rows 1 and 5
+ const uint32x2_t x_2_6 = vreinterpret_u32_u8(r23_u8.val[0]); // rows 2 and 6
+ const uint32x2_t x_3_7 = vreinterpret_u32_u8(r23_u8.val[1]); // rows 3 and 7
+ vst1_lane_u32((uint32_t *)dst, x_0_4, 0); // NOTE(review): dst is cast to uint32_t * with no visible alignment check
+ dst += pitch;
+ vst1_lane_u32((uint32_t *)dst, x_1_5, 0);
+ dst += pitch;
+ vst1_lane_u32((uint32_t *)dst, x_2_6, 0);
+ dst += pitch;
+ vst1_lane_u32((uint32_t *)dst, x_3_7, 0);
+ dst += pitch;
+ vst1_lane_u32((uint32_t *)dst, x_0_4, 1); // rows 4-7 come from lane 1
+ dst += pitch;
+ vst1_lane_u32((uint32_t *)dst, x_1_5, 1);
+ dst += pitch;
+ vst1_lane_u32((uint32_t *)dst, x_2_6, 1);
+ dst += pitch;
+ vst1_lane_u32((uint32_t *)dst, x_3_7, 1);
+#else
+ vst4_lane_u8(dst, result, 0); // writes lane 0 of val[0..3] as 4 consecutive bytes
+ dst += pitch;
+ vst4_lane_u8(dst, result, 1);
+ dst += pitch;
+ vst4_lane_u8(dst, result, 2);
+ dst += pitch;
+ vst4_lane_u8(dst, result, 3);
+ dst += pitch;
+ vst4_lane_u8(dst, result, 4);
+ dst += pitch;
+ vst4_lane_u8(dst, result, 5);
+ dst += pitch;
+ vst4_lane_u8(dst, result, 6);
+ dst += pitch;
+ vst4_lane_u8(dst, result, 7);
+#endif // VPX_INCOMPATIBLE_GCC
+}
+
+void vp8_loop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
+ unsigned char blimit,
+ unsigned char limit,
+ unsigned char thresh) { // filters 16 rows across the edge just left of column src
+ unsigned char *s, *d;
+ uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+ uint8x16_t q5, q6, q7, q8, q9, q10;
+ uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
+ uint8x8_t d15, d16, d17, d18, d19, d20, d21;
+ uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
+ uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
+ uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
+ uint8x8x4_t q4ResultH, q4ResultL;
+
+ qblimit = vdupq_n_u8(blimit); // broadcast each scalar to all 16 lanes
+ qlimit = vdupq_n_u8(limit);
+ qthresh = vdupq_n_u8(thresh);
+
+ s = src - 4; // each 8-byte row load covers columns src-4 .. src+3
+ d6 = vld1_u8(s); // rows 0-7 go to d6, d8, ..., d20
+ s += pitch;
+ d8 = vld1_u8(s);
+ s += pitch;
+ d10 = vld1_u8(s);
+ s += pitch;
+ d12 = vld1_u8(s);
+ s += pitch;
+ d14 = vld1_u8(s);
+ s += pitch;
+ d16 = vld1_u8(s);
+ s += pitch;
+ d18 = vld1_u8(s);
+ s += pitch;
+ d20 = vld1_u8(s);
+ s += pitch;
+ d7 = vld1_u8(s); // rows 8-15 go to d7, d9, ..., d21
+ s += pitch;
+ d9 = vld1_u8(s);
+ s += pitch;
+ d11 = vld1_u8(s);
+ s += pitch;
+ d13 = vld1_u8(s);
+ s += pitch;
+ d15 = vld1_u8(s);
+ s += pitch;
+ d17 = vld1_u8(s);
+ s += pitch;
+ d19 = vld1_u8(s);
+ s += pitch;
+ d21 = vld1_u8(s);
+
+ q3 = vcombine_u8(d6, d7); // row i in the low half, row i + 8 in the high half
+ q4 = vcombine_u8(d8, d9);
+ q5 = vcombine_u8(d10, d11);
+ q6 = vcombine_u8(d12, d13);
+ q7 = vcombine_u8(d14, d15);
+ q8 = vcombine_u8(d16, d17);
+ q9 = vcombine_u8(d18, d19);
+ q10 = vcombine_u8(d20, d21);
+
+ q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
+ q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
+ q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
+ q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
+
+ q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
+ vreinterpretq_u16_u32(q2tmp2.val[0]));
+ q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
+ vreinterpretq_u16_u32(q2tmp3.val[0]));
+ q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
+ vreinterpretq_u16_u32(q2tmp2.val[1]));
+ q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
+ vreinterpretq_u16_u32(q2tmp3.val[1]));
+
+ q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
+ vreinterpretq_u8_u16(q2tmp5.val[0]));
+ q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
+ vreinterpretq_u8_u16(q2tmp5.val[1]));
+ q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
+ vreinterpretq_u8_u16(q2tmp7.val[0]));
+ q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
+ vreinterpretq_u8_u16(q2tmp7.val[1]));
+
+ q3 = q2tmp8.val[0]; // after the 32/16/8-bit vtrn passes each register holds one column; lane i = row i
+ q4 = q2tmp8.val[1];
+ q5 = q2tmp9.val[0];
+ q6 = q2tmp9.val[1];
+ q7 = q2tmp10.val[0];
+ q8 = q2tmp10.val[1];
+ q9 = q2tmp11.val[0];
+ q10 = q2tmp11.val[1];
+
+ vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4, q5, q6, q7, q8, q9,
+ q10, &q5, &q6, &q7, &q8); // q3..q10 are passed as p3..q3; results overwrite q5..q8
+
+ q4ResultL.val[0] = vget_low_u8(q5); // d10
+ q4ResultL.val[1] = vget_low_u8(q6); // d12
+ q4ResultL.val[2] = vget_low_u8(q7); // d14
+ q4ResultL.val[3] = vget_low_u8(q8); // d16
+ q4ResultH.val[0] = vget_high_u8(q5); // d11
+ q4ResultH.val[1] = vget_high_u8(q6); // d13
+ q4ResultH.val[2] = vget_high_u8(q7); // d15
+ q4ResultH.val[3] = vget_high_u8(q8); // d17
+
+ d = src - 2; // only columns src-2 .. src+1 (p1, p0, q0, q1) are written back
+ write_4x8(d, pitch, q4ResultL); // rows 0-7
+ d += pitch * 8;
+ write_4x8(d, pitch, q4ResultH); // rows 8-15
+}
+
+void vp8_loop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch,
+ unsigned char blimit,
+ unsigned char limit,
+ unsigned char thresh,
+ unsigned char *v) { // filters 8 U rows and 8 V rows in one 16-lane pass
+ unsigned char *us, *ud;
+ unsigned char *vs, *vd;
+ uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+ uint8x16_t q5, q6, q7, q8, q9, q10;
+ uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
+ uint8x8_t d15, d16, d17, d18, d19, d20, d21;
+ uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
+ uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
+ uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
+ uint8x8x4_t q4ResultH, q4ResultL;
+
+ qblimit = vdupq_n_u8(blimit); // broadcast each scalar to all 16 lanes
+ qlimit = vdupq_n_u8(limit);
+ qthresh = vdupq_n_u8(thresh);
+
+ us = u - 4; // each 8-byte row load covers columns u-4 .. u+3
+ d6 = vld1_u8(us); // the 8 U rows go to d6, d8, ..., d20
+ us += pitch;
+ d8 = vld1_u8(us);
+ us += pitch;
+ d10 = vld1_u8(us);
+ us += pitch;
+ d12 = vld1_u8(us);
+ us += pitch;
+ d14 = vld1_u8(us);
+ us += pitch;
+ d16 = vld1_u8(us);
+ us += pitch;
+ d18 = vld1_u8(us);
+ us += pitch;
+ d20 = vld1_u8(us);
+
+ vs = v - 4; // same column window for V
+ d7 = vld1_u8(vs); // the 8 V rows go to d7, d9, ..., d21
+ vs += pitch;
+ d9 = vld1_u8(vs);
+ vs += pitch;
+ d11 = vld1_u8(vs);
+ vs += pitch;
+ d13 = vld1_u8(vs);
+ vs += pitch;
+ d15 = vld1_u8(vs);
+ vs += pitch;
+ d17 = vld1_u8(vs);
+ vs += pitch;
+ d19 = vld1_u8(vs);
+ vs += pitch;
+ d21 = vld1_u8(vs);
+
+ q3 = vcombine_u8(d6, d7); // U row i in the low half, V row i in the high half
+ q4 = vcombine_u8(d8, d9);
+ q5 = vcombine_u8(d10, d11);
+ q6 = vcombine_u8(d12, d13);
+ q7 = vcombine_u8(d14, d15);
+ q8 = vcombine_u8(d16, d17);
+ q9 = vcombine_u8(d18, d19);
+ q10 = vcombine_u8(d20, d21);
+
+ q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
+ q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
+ q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
+ q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
+
+ q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
+ vreinterpretq_u16_u32(q2tmp2.val[0]));
+ q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
+ vreinterpretq_u16_u32(q2tmp3.val[0]));
+ q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
+ vreinterpretq_u16_u32(q2tmp2.val[1]));
+ q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
+ vreinterpretq_u16_u32(q2tmp3.val[1]));
+
+ q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
+ vreinterpretq_u8_u16(q2tmp5.val[0]));
+ q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
+ vreinterpretq_u8_u16(q2tmp5.val[1]));
+ q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
+ vreinterpretq_u8_u16(q2tmp7.val[0]));
+ q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
+ vreinterpretq_u8_u16(q2tmp7.val[1]));
+
+ q3 = q2tmp8.val[0]; // after the 32/16/8-bit vtrn passes each register holds one column; lanes 0-7 = U, 8-15 = V
+ q4 = q2tmp8.val[1];
+ q5 = q2tmp9.val[0];
+ q6 = q2tmp9.val[1];
+ q7 = q2tmp10.val[0];
+ q8 = q2tmp10.val[1];
+ q9 = q2tmp11.val[0];
+ q10 = q2tmp11.val[1];
+
+ vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4, q5, q6, q7, q8, q9,
+ q10, &q5, &q6, &q7, &q8); // q3..q10 are passed as p3..q3; results overwrite q5..q8
+
+ q4ResultL.val[0] = vget_low_u8(q5); // d10
+ q4ResultL.val[1] = vget_low_u8(q6); // d12
+ q4ResultL.val[2] = vget_low_u8(q7); // d14
+ q4ResultL.val[3] = vget_low_u8(q8); // d16
+ ud = u - 2; // only columns u-2 .. u+1 (p1, p0, q0, q1) are written back
+ write_4x8(ud, pitch, q4ResultL);
+
+ q4ResultH.val[0] = vget_high_u8(q5); // d11
+ q4ResultH.val[1] = vget_high_u8(q6); // d13
+ q4ResultH.val[2] = vget_high_u8(q7); // d15
+ q4ResultH.val[3] = vget_high_u8(q8); // d17
+ vd = v - 2; // same four columns for V
+ write_4x8(vd, pitch, q4ResultH);
+}
diff --git a/media/libvpx/libvpx/vp8/common/blockd.c b/media/libvpx/libvpx/vp8/common/blockd.c
new file mode 100644
index 0000000000..22905c10a6
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/blockd.c
@@ -0,0 +1,19 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "blockd.h"
+#include "vpx_mem/vpx_mem.h"
+
+const unsigned char vp8_block2left[25] = { 0, 0, 0, 0, 1, 1, 1, 1, 2,
+ 2, 2, 2, 3, 3, 3, 3, 4, 4,
+ 5, 5, 6, 6, 7, 7, 8 }; /* 25 entries, values 0..8; NOTE(review): presumably indexed by block number to select a left-context entry - confirm */
+const unsigned char vp8_block2above[25] = { 0, 1, 2, 3, 0, 1, 2, 3, 0,
+ 1, 2, 3, 0, 1, 2, 3, 4, 5,
+ 4, 5, 6, 7, 6, 7, 8 }; /* 25 entries, values 0..8; NOTE(review): presumably indexed by block number to select an above-context entry - confirm */
diff --git a/media/libvpx/libvpx/vp8/common/blockd.h b/media/libvpx/libvpx/vp8/common/blockd.h
new file mode 100644
index 0000000000..405443449d
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/blockd.h
@@ -0,0 +1,311 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP8_COMMON_BLOCKD_H_
+#define VPX_VP8_COMMON_BLOCKD_H_
+
+void vpx_log(const char *format, ...);
+
+#include "vpx/internal/vpx_codec_internal.h"
+#include "vpx_config.h"
+#include "vpx_scale/yv12config.h"
+#include "mv.h"
+#include "treecoder.h"
+#include "vpx_ports/mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*#define DCPRED 1*/
+#define DCPREDSIMTHRESH 0
+#define DCPREDCNTTHRESH 3
+
+#define MB_FEATURE_TREE_PROBS 3
+#define MAX_MB_SEGMENTS 4
+
+#define MAX_REF_LF_DELTAS 4
+#define MAX_MODE_LF_DELTAS 4
+
+/* Segment Feature Masks */
+#define SEGMENT_DELTADATA 0
+#define SEGMENT_ABSDATA 1
+
+typedef struct {
+ int r, c;
+} POS;
+
+#define PLANE_TYPE_Y_NO_DC 0
+#define PLANE_TYPE_Y2 1
+#define PLANE_TYPE_UV 2
+#define PLANE_TYPE_Y_WITH_DC 3
+
+typedef char ENTROPY_CONTEXT;
+typedef struct {
+ ENTROPY_CONTEXT y1[4];
+ ENTROPY_CONTEXT u[2];
+ ENTROPY_CONTEXT v[2];
+ ENTROPY_CONTEXT y2;
+} ENTROPY_CONTEXT_PLANES;
+
+extern const unsigned char vp8_block2left[25];
+extern const unsigned char vp8_block2above[25];
+
+#define VP8_COMBINEENTROPYCONTEXTS(Dest, A, B) Dest = (A) + (B)
+
+typedef enum { KEY_FRAME = 0, INTER_FRAME = 1 } FRAME_TYPE;
+
+typedef enum {
+ DC_PRED, /* average of above and left pixels */
+ V_PRED, /* vertical prediction */
+ H_PRED, /* horizontal prediction */
+ TM_PRED, /* Truemotion prediction */
+ B_PRED, /* block based prediction, each block has its own prediction mode */
+
+ NEARESTMV,
+ NEARMV,
+ ZEROMV,
+ NEWMV,
+ SPLITMV,
+
+ MB_MODE_COUNT
+} MB_PREDICTION_MODE;
+
+/* Macroblock level features */
+typedef enum {
+ MB_LVL_ALT_Q = 0, /* Use alternate Quantizer .... */
+ MB_LVL_ALT_LF = 1, /* Use alternate loop filter value... */
+ MB_LVL_MAX = 2 /* Number of MB level features supported */
+
+} MB_LVL_FEATURES;
+
+/* Segment Feature Masks */
+#define SEGMENT_ALTQ 0x01
+#define SEGMENT_ALT_LF 0x02
+
+#define VP8_YMODES (B_PRED + 1)
+#define VP8_UV_MODES (TM_PRED + 1)
+
+#define VP8_MVREFS (1 + SPLITMV - NEARESTMV)
+
+typedef enum {
+ B_DC_PRED, /* average of above and left pixels */
+ B_TM_PRED,
+
+ B_VE_PRED, /* vertical prediction */
+ B_HE_PRED, /* horizontal prediction */
+
+ B_LD_PRED,
+ B_RD_PRED,
+
+ B_VR_PRED,
+ B_VL_PRED,
+ B_HD_PRED,
+ B_HU_PRED,
+
+ LEFT4X4,
+ ABOVE4X4,
+ ZERO4X4,
+ NEW4X4,
+
+ B_MODE_COUNT
+} B_PREDICTION_MODE;
+
+#define VP8_BINTRAMODES (B_HU_PRED + 1) /* 10 */
+#define VP8_SUBMVREFS (1 + NEW4X4 - LEFT4X4)
+
+/* For keyframes, intra block modes are predicted by the (already decoded)
+ modes for the Y blocks to the left and above us; for interframes, there
+ is a single probability table. */
+
+union b_mode_info {
+ B_PREDICTION_MODE as_mode;
+ int_mv mv;
+};
+
+typedef enum {
+ INTRA_FRAME = 0,
+ LAST_FRAME = 1,
+ GOLDEN_FRAME = 2,
+ ALTREF_FRAME = 3,
+ MAX_REF_FRAMES = 4
+} MV_REFERENCE_FRAME;
+
+typedef struct {
+ uint8_t mode, uv_mode;
+ uint8_t ref_frame;
+ uint8_t is_4x4;
+ int_mv mv;
+
+ uint8_t partitioning;
+ /* Does this MB have any coefficients at all? 1 = no coefficients, 0 = need to
+ decode tokens */
+ uint8_t mb_skip_coeff;
+ uint8_t need_to_clamp_mvs;
+ /* Which set of segmentation parameters should be used for this MB */
+ uint8_t segment_id;
+} MB_MODE_INFO;
+
+typedef struct modeinfo {
+ MB_MODE_INFO mbmi;
+ union b_mode_info bmi[16];
+} MODE_INFO;
+
+#if CONFIG_MULTI_RES_ENCODING
+/* The mb-level information needed to be stored for higher-resolution encoder */
+typedef struct {
+ MB_PREDICTION_MODE mode;
+ MV_REFERENCE_FRAME ref_frame;
+ int_mv mv;
+ int dissim; /* dissimilarity level of the macroblock */
+} LOWER_RES_MB_INFO;
+
+/* The frame-level information needed to be stored for higher-resolution
+ * encoder */
+typedef struct {
+ FRAME_TYPE frame_type;
+ int is_frame_dropped;
+ // If frame is dropped due to overshoot after encode_frame. This triggers a
+ // drop and resets rate control with Q forced to max for following frame.
+ // The check for this dropping due to overshoot is only done on lowest stream,
+ // and if set will force drop on all spatial streams for that current frame.
+ int is_frame_dropped_overshoot_maxqp;
+ // The frame rate for the lowest resolution.
+ double low_res_framerate;
+ /* The frame number of each reference frames */
+ unsigned int low_res_ref_frames[MAX_REF_FRAMES];
+ // The video frame counter value for the key frame, for lowest resolution.
+ unsigned int key_frame_counter_value;
+ // Flags to signal skipped encoding of previous and base layer stream.
+ unsigned int skip_encoding_prev_stream;
+ unsigned int skip_encoding_base_stream;
+ LOWER_RES_MB_INFO *mb_info;
+} LOWER_RES_FRAME_INFO;
+#endif
+
+typedef struct blockd {
+ short *qcoeff;
+ short *dqcoeff;
+ unsigned char *predictor;
+ short *dequant;
+
+ int offset;
+ char *eob;
+
+ union b_mode_info bmi;
+} BLOCKD;
+
+typedef void (*vp8_subpix_fn_t)(unsigned char *src_ptr, int src_pixels_per_line,
+ int xoffset, int yoffset,
+ unsigned char *dst_ptr, int dst_pitch);
+
+typedef struct macroblockd {
+ DECLARE_ALIGNED(16, unsigned char, predictor[384]);
+ DECLARE_ALIGNED(16, short, qcoeff[400]); /* 25 blocks * 16 entries */
+ DECLARE_ALIGNED(16, short, dqcoeff[400]); /* 25 blocks * 16 entries */
+ DECLARE_ALIGNED(16, char, eobs[25]);
+
+ DECLARE_ALIGNED(16, short, dequant_y1[16]);
+ DECLARE_ALIGNED(16, short, dequant_y1_dc[16]);
+ DECLARE_ALIGNED(16, short, dequant_y2[16]);
+ DECLARE_ALIGNED(16, short, dequant_uv[16]);
+
+ /* 16 Y blocks, 4 U, 4 V, 1 DC 2nd order block, each with 16 entries. */
+ BLOCKD block[25];
+ int fullpixel_mask;
+
+ YV12_BUFFER_CONFIG pre; /* Filtered copy of previous frame reconstruction */
+ YV12_BUFFER_CONFIG dst;
+
+ MODE_INFO *mode_info_context;
+ int mode_info_stride;
+
+ FRAME_TYPE frame_type;
+
+ int up_available;
+ int left_available;
+
+ unsigned char *recon_above[3];
+ unsigned char *recon_left[3];
+ int recon_left_stride[2];
+
+ /* Y,U,V,Y2 */
+ ENTROPY_CONTEXT_PLANES *above_context;
+ ENTROPY_CONTEXT_PLANES *left_context;
+
+ /* 0 indicates segmentation at MB level is not enabled. Otherwise the
+ * individual bits indicate which features are active. */
+ unsigned char segmentation_enabled;
+
+ /* 0 (do not update) 1 (update) the macroblock segmentation map. */
+ unsigned char update_mb_segmentation_map;
+
+ /* 0 (do not update) 1 (update) the macroblock segmentation feature data. */
+ unsigned char update_mb_segmentation_data;
+
+ /* NOTE(review): presumably SEGMENT_DELTADATA (0) or SEGMENT_ABSDATA (1) - confirm. */
+ unsigned char mb_segement_abs_delta;
+
+ /* Per-frame flags that define which MB-level features (such as quantizer or
+ * loop filter level) are enabled and, when enabled, the probabilities used
+ * to decode the per-MB flags in MB_MODE_INFO.
+ */
+ /* Probability Tree used to code Segment number */
+ vp8_prob mb_segment_tree_probs[MB_FEATURE_TREE_PROBS];
+ /* Segment parameters */
+ signed char segment_feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS];
+
+ /* mode_based Loop filter adjustment */
+ unsigned char mode_ref_lf_delta_enabled;
+ unsigned char mode_ref_lf_delta_update;
+
+ /* Delta values have the range +/- MAX_LOOP_FILTER */
+ signed char
+ last_ref_lf_deltas[MAX_REF_LF_DELTAS]; /* 0 = Intra, Last, GF, ARF */
+ signed char ref_lf_deltas[MAX_REF_LF_DELTAS]; /* 0 = Intra, Last, GF, ARF */
+ /* 0 = BPRED, ZERO_MV, MV, SPLIT */
+ signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS];
+ signed char
+ mode_lf_deltas[MAX_MODE_LF_DELTAS]; /* 0 = BPRED, ZERO_MV, MV, SPLIT */
+
+ /* Distance of MB away from frame edges */
+ int mb_to_left_edge;
+ int mb_to_right_edge;
+ int mb_to_top_edge;
+ int mb_to_bottom_edge;
+
+ vp8_subpix_fn_t subpixel_predict;
+ vp8_subpix_fn_t subpixel_predict8x4;
+ vp8_subpix_fn_t subpixel_predict8x8;
+ vp8_subpix_fn_t subpixel_predict16x16;
+
+ void *current_bc;
+
+ int corrupted;
+
+ struct vpx_internal_error_info error_info;
+
+#if VPX_ARCH_X86 || VPX_ARCH_X86_64
+ /* This is an intermediate buffer currently used in sub-pixel motion search
+ * to keep a copy of the reference area. This buffer can be used for other
+ * purposes.
+ */
+ DECLARE_ALIGNED(32, unsigned char, y_buf[22 * 32]);
+#endif
+} MACROBLOCKD;
+
+extern void vp8_build_block_doffsets(MACROBLOCKD *x);
+extern void vp8_setup_block_dptrs(MACROBLOCKD *x);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP8_COMMON_BLOCKD_H_
diff --git a/media/libvpx/libvpx/vp8/common/coefupdateprobs.h b/media/libvpx/libvpx/vp8/common/coefupdateprobs.h
new file mode 100644
index 0000000000..b342096b55
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/coefupdateprobs.h
@@ -0,0 +1,197 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP8_COMMON_COEFUPDATEPROBS_H_
+#define VPX_VP8_COMMON_COEFUPDATEPROBS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Update probabilities for the nodes in the token entropy tree.
+ Generated file included by entropy.c */
+
+const vp8_prob vp8_coef_update_probs
+ [BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES] = {
+ {
+ {
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ },
+ {
+ { 176, 246, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 223, 241, 252, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 249, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
+ },
+ {
+ { 255, 244, 252, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 234, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ },
+ {
+ { 255, 246, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 239, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ },
+ {
+ { 255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 251, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ },
+ {
+ { 255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 251, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ },
+ {
+ { 255, 254, 253, 255, 254, 255, 255, 255, 255, 255, 255 },
+ { 250, 255, 254, 255, 254, 255, 255, 255, 255, 255, 255 },
+ { 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ },
+ {
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ },
+ },
+ {
+ {
+ { 217, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 225, 252, 241, 253, 255, 255, 254, 255, 255, 255, 255 },
+ { 234, 250, 241, 250, 253, 255, 253, 254, 255, 255, 255 },
+ },
+ {
+ { 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 223, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 238, 253, 254, 254, 255, 255, 255, 255, 255, 255, 255 },
+ },
+ {
+ { 255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 249, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ },
+ {
+ { 255, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 247, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ },
+ {
+ { 255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ },
+ {
+ { 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ },
+ {
+ { 255, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ },
+ {
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ },
+ },
+ {
+ {
+ { 186, 251, 250, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 234, 251, 244, 254, 255, 255, 255, 255, 255, 255, 255 },
+ { 251, 251, 243, 253, 254, 255, 254, 255, 255, 255, 255 },
+ },
+ {
+ { 255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 236, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 251, 253, 253, 254, 254, 255, 255, 255, 255, 255, 255 },
+ },
+ {
+ { 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ },
+ {
+ { 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ },
+ {
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ },
+ {
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ },
+ {
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ },
+ {
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ },
+ },
+ {
+ {
+ { 248, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 250, 254, 252, 254, 255, 255, 255, 255, 255, 255, 255 },
+ { 248, 254, 249, 253, 255, 255, 255, 255, 255, 255, 255 },
+ },
+ {
+ { 255, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 246, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 252, 254, 251, 254, 254, 255, 255, 255, 255, 255, 255 },
+ },
+ {
+ { 255, 254, 252, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 248, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 253, 255, 254, 254, 255, 255, 255, 255, 255, 255, 255 },
+ },
+ {
+ { 255, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 245, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 253, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ },
+ {
+ { 255, 251, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ },
+ {
+ { 255, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 249, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ },
+ {
+ { 255, 255, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ },
+ {
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ },
+ },
+ };
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP8_COMMON_COEFUPDATEPROBS_H_
diff --git a/media/libvpx/libvpx/vp8/common/common.h b/media/libvpx/libvpx/vp8/common/common.h
new file mode 100644
index 0000000000..562569f9ab
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/common.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP8_COMMON_COMMON_H_
+#define VPX_VP8_COMMON_COMMON_H_
+
+#include <assert.h>
+
+/* Interface header for common constant data structures and lookup tables */
+
+#include "vpx_mem/vpx_mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Whole-array copy; the assert checks equal sizeof. For structs just assign. */
+
+#define vp8_copy(Dest, Src) \
+ do { \
+ assert(sizeof(Dest) == sizeof(Src)); \
+ memcpy(Dest, Src, sizeof(Src)); \
+ } while (0)
+
+/* Copy N elements of a variably-sized array; asserts matching element size. */
+
+#define vp8_copy_array(Dest, Src, N) \
+ do { \
+ assert(sizeof(*(Dest)) == sizeof(*(Src))); \
+ memcpy(Dest, Src, (N) * sizeof(*(Src))); \
+ } while (0)
+
+#define vp8_zero(Dest) memset(&(Dest), 0, sizeof(Dest))
+/* vp8_zero clears one whole object; vp8_zero_array clears N elements at Dest. */
+#define vp8_zero_array(Dest, N) memset(Dest, 0, (N) * sizeof(*(Dest)))
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP8_COMMON_COMMON_H_
diff --git a/media/libvpx/libvpx/vp8/common/context.c b/media/libvpx/libvpx/vp8/common/context.c
new file mode 100644
index 0000000000..3c624ae628
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/context.c
@@ -0,0 +1,398 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "entropy.h"
+
+/* *** GENERATED FILE: DO NOT EDIT *** (all data below is inside #if 0) */
+
+#if 0
+int Contexts[vp8_coef_counter_dimen];
+
+const int default_contexts[vp8_coef_counter_dimen] =
+{
+ {
+ // Block Type ( 0 )
+ {
+ // Coeff Band ( 0 )
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ },
+ {
+ // Coeff Band ( 1 )
+ {30190, 26544, 225, 24, 4, 0, 0, 0, 0, 0, 0, 4171593,},
+ {26846, 25157, 1241, 130, 26, 6, 1, 0, 0, 0, 0, 149987,},
+ {10484, 9538, 1006, 160, 36, 18, 0, 0, 0, 0, 0, 15104,},
+ },
+ {
+ // Coeff Band ( 2 )
+ {25842, 40456, 1126, 83, 11, 2, 0, 0, 0, 0, 0, 0,},
+ {9338, 8010, 512, 73, 7, 3, 2, 0, 0, 0, 0, 43294,},
+ {1047, 751, 149, 31, 13, 6, 1, 0, 0, 0, 0, 879,},
+ },
+ {
+ // Coeff Band ( 3 )
+ {26136, 9826, 252, 13, 0, 0, 0, 0, 0, 0, 0, 0,},
+ {8134, 5574, 191, 14, 2, 0, 0, 0, 0, 0, 0, 35302,},
+ { 605, 677, 116, 9, 1, 0, 0, 0, 0, 0, 0, 611,},
+ },
+ {
+ // Coeff Band ( 4 )
+ {10263, 15463, 283, 17, 0, 0, 0, 0, 0, 0, 0, 0,},
+ {2773, 2191, 128, 9, 2, 2, 0, 0, 0, 0, 0, 10073,},
+ { 134, 125, 32, 4, 0, 2, 0, 0, 0, 0, 0, 50,},
+ },
+ {
+ // Coeff Band ( 5 )
+ {10483, 2663, 23, 1, 0, 0, 0, 0, 0, 0, 0, 0,},
+ {2137, 1251, 27, 1, 1, 0, 0, 0, 0, 0, 0, 14362,},
+ { 116, 156, 14, 2, 1, 0, 0, 0, 0, 0, 0, 190,},
+ },
+ {
+ // Coeff Band ( 6 )
+ {40977, 27614, 412, 28, 0, 0, 0, 0, 0, 0, 0, 0,},
+ {6113, 5213, 261, 22, 3, 0, 0, 0, 0, 0, 0, 26164,},
+ { 382, 312, 50, 14, 2, 0, 0, 0, 0, 0, 0, 345,},
+ },
+ {
+ // Coeff Band ( 7 )
+ { 0, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 0, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 319,},
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8,},
+ },
+ },
+ {
+ // Block Type ( 1 )
+ {
+ // Coeff Band ( 0 )
+ {3268, 19382, 1043, 250, 93, 82, 49, 26, 17, 8, 25, 82289,},
+ {8758, 32110, 5436, 1832, 827, 668, 420, 153, 24, 0, 3, 52914,},
+ {9337, 23725, 8487, 3954, 2107, 1836, 1069, 399, 59, 0, 0, 18620,},
+ },
+ {
+ // Coeff Band ( 1 )
+ {12419, 8420, 452, 62, 9, 1, 0, 0, 0, 0, 0, 0,},
+ {11715, 8705, 693, 92, 15, 7, 2, 0, 0, 0, 0, 53988,},
+ {7603, 8585, 2306, 778, 270, 145, 39, 5, 0, 0, 0, 9136,},
+ },
+ {
+ // Coeff Band ( 2 )
+ {15938, 14335, 1207, 184, 55, 13, 4, 1, 0, 0, 0, 0,},
+ {7415, 6829, 1138, 244, 71, 26, 7, 0, 0, 0, 0, 9980,},
+ {1580, 1824, 655, 241, 89, 46, 10, 2, 0, 0, 0, 429,},
+ },
+ {
+ // Coeff Band ( 3 )
+ {19453, 5260, 201, 19, 0, 0, 0, 0, 0, 0, 0, 0,},
+ {9173, 3758, 213, 22, 1, 1, 0, 0, 0, 0, 0, 9820,},
+ {1689, 1277, 276, 51, 17, 4, 0, 0, 0, 0, 0, 679,},
+ },
+ {
+ // Coeff Band ( 4 )
+ {12076, 10667, 620, 85, 19, 9, 5, 0, 0, 0, 0, 0,},
+ {4665, 3625, 423, 55, 19, 9, 0, 0, 0, 0, 0, 5127,},
+ { 415, 440, 143, 34, 20, 7, 2, 0, 0, 0, 0, 101,},
+ },
+ {
+ // Coeff Band ( 5 )
+ {12183, 4846, 115, 11, 1, 0, 0, 0, 0, 0, 0, 0,},
+ {4226, 3149, 177, 21, 2, 0, 0, 0, 0, 0, 0, 7157,},
+ { 375, 621, 189, 51, 11, 4, 1, 0, 0, 0, 0, 198,},
+ },
+ {
+ // Coeff Band ( 6 )
+ {61658, 37743, 1203, 94, 10, 3, 0, 0, 0, 0, 0, 0,},
+ {15514, 11563, 903, 111, 14, 5, 0, 0, 0, 0, 0, 25195,},
+ { 929, 1077, 291, 78, 14, 7, 1, 0, 0, 0, 0, 507,},
+ },
+ {
+ // Coeff Band ( 7 )
+ { 0, 990, 15, 3, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 0, 412, 13, 0, 0, 0, 0, 0, 0, 0, 0, 1641,},
+ { 0, 18, 7, 1, 0, 0, 0, 0, 0, 0, 0, 30,},
+ },
+ },
+ {
+ // Block Type ( 2 )
+ {
+ // Coeff Band ( 0 )
+ { 953, 24519, 628, 120, 28, 12, 4, 0, 0, 0, 0, 2248798,},
+ {1525, 25654, 2647, 617, 239, 143, 42, 5, 0, 0, 0, 66837,},
+ {1180, 11011, 3001, 1237, 532, 448, 239, 54, 5, 0, 0, 7122,},
+ },
+ {
+ // Coeff Band ( 1 )
+ {1356, 2220, 67, 10, 4, 1, 0, 0, 0, 0, 0, 0,},
+ {1450, 2544, 102, 18, 4, 3, 0, 0, 0, 0, 0, 57063,},
+ {1182, 2110, 470, 130, 41, 21, 0, 0, 0, 0, 0, 6047,},
+ },
+ {
+ // Coeff Band ( 2 )
+ { 370, 3378, 200, 30, 5, 4, 1, 0, 0, 0, 0, 0,},
+ { 293, 1006, 131, 29, 11, 0, 0, 0, 0, 0, 0, 5404,},
+ { 114, 387, 98, 23, 4, 8, 1, 0, 0, 0, 0, 236,},
+ },
+ {
+ // Coeff Band ( 3 )
+ { 579, 194, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 395, 213, 5, 1, 0, 0, 0, 0, 0, 0, 0, 4157,},
+ { 119, 122, 4, 0, 0, 0, 0, 0, 0, 0, 0, 300,},
+ },
+ {
+ // Coeff Band ( 4 )
+ { 38, 557, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 21, 114, 12, 1, 0, 0, 0, 0, 0, 0, 0, 427,},
+ { 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7,},
+ },
+ {
+ // Coeff Band ( 5 )
+ { 52, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 18, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 652,},
+ { 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 30,},
+ },
+ {
+ // Coeff Band ( 6 )
+ { 640, 569, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 25, 77, 2, 0, 0, 0, 0, 0, 0, 0, 0, 517,},
+ { 4, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3,},
+ },
+ {
+ // Coeff Band ( 7 )
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ },
+ },
+ {
+ // Block Type ( 3 )
+ {
+ // Coeff Band ( 0 )
+ {2506, 20161, 2707, 767, 261, 178, 107, 30, 14, 3, 0, 100694,},
+ {8806, 36478, 8817, 3268, 1280, 850, 401, 114, 42, 0, 0, 58572,},
+ {11003, 27214, 11798, 5716, 2482, 2072, 1048, 175, 32, 0, 0, 19284,},
+ },
+ {
+ // Coeff Band ( 1 )
+ {9738, 11313, 959, 205, 70, 18, 11, 1, 0, 0, 0, 0,},
+ {12628, 15085, 1507, 273, 52, 19, 9, 0, 0, 0, 0, 54280,},
+ {10701, 15846, 5561, 1926, 813, 570, 249, 36, 0, 0, 0, 6460,},
+ },
+ {
+ // Coeff Band ( 2 )
+ {6781, 22539, 2784, 634, 182, 123, 20, 4, 0, 0, 0, 0,},
+ {6263, 11544, 2649, 790, 259, 168, 27, 5, 0, 0, 0, 20539,},
+ {3109, 4075, 2031, 896, 457, 386, 158, 29, 0, 0, 0, 1138,},
+ },
+ {
+ // Coeff Band ( 3 )
+ {11515, 4079, 465, 73, 5, 14, 2, 0, 0, 0, 0, 0,},
+ {9361, 5834, 650, 96, 24, 8, 4, 0, 0, 0, 0, 22181,},
+ {4343, 3974, 1360, 415, 132, 96, 14, 1, 0, 0, 0, 1267,},
+ },
+ {
+ // Coeff Band ( 4 )
+ {4787, 9297, 823, 168, 44, 12, 4, 0, 0, 0, 0, 0,},
+ {3619, 4472, 719, 198, 60, 31, 3, 0, 0, 0, 0, 8401,},
+ {1157, 1175, 483, 182, 88, 31, 8, 0, 0, 0, 0, 268,},
+ },
+ {
+ // Coeff Band ( 5 )
+ {8299, 1226, 32, 5, 1, 0, 0, 0, 0, 0, 0, 0,},
+ {3502, 1568, 57, 4, 1, 1, 0, 0, 0, 0, 0, 9811,},
+ {1055, 1070, 166, 29, 6, 1, 0, 0, 0, 0, 0, 527,},
+ },
+ {
+ // Coeff Band ( 6 )
+ {27414, 27927, 1989, 347, 69, 26, 0, 0, 0, 0, 0, 0,},
+ {5876, 10074, 1574, 341, 91, 24, 4, 0, 0, 0, 0, 21954,},
+ {1571, 2171, 778, 324, 124, 65, 16, 0, 0, 0, 0, 979,},
+ },
+ {
+ // Coeff Band ( 7 )
+ { 0, 29, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 0, 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 459,},
+ { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13,},
+ },
+ },
+};
+
+//Update probabilities for the nodes in the token entropy tree.
+const vp8_prob tree_update_probs[vp8_coef_tree_dimen] =
+{
+ {
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {176, 246, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {223, 241, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {249, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 244, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {234, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 246, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {239, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {251, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {251, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 254, 253, 255, 254, 255, 255, 255, 255, 255, 255, },
+ {250, 255, 254, 255, 254, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ },
+ {
+ {
+ {217, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {225, 252, 241, 253, 255, 255, 254, 255, 255, 255, 255, },
+ {234, 250, 241, 250, 253, 255, 253, 254, 255, 255, 255, },
+ },
+ {
+ {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {223, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {238, 253, 254, 254, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {249, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {247, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ },
+ {
+ {
+ {186, 251, 250, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {234, 251, 244, 254, 255, 255, 255, 255, 255, 255, 255, },
+ {251, 251, 243, 253, 254, 255, 254, 255, 255, 255, 255, },
+ },
+ {
+ {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {236, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {251, 253, 253, 254, 254, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ },
+ {
+ {
+ {248, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {250, 254, 252, 254, 255, 255, 255, 255, 255, 255, 255, },
+ {248, 254, 249, 253, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {246, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {252, 254, 251, 254, 254, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 254, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {248, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {253, 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {245, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {253, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 251, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {249, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ },
+};
+#endif
diff --git a/media/libvpx/libvpx/vp8/common/debugmodes.c b/media/libvpx/libvpx/vp8/common/debugmodes.c
new file mode 100644
index 0000000000..27a97b260c
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/debugmodes.c
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdio.h>
+#include "blockd.h"
+
+/* Debug helper: appends the per-macroblock modes, reference frames and motion
+ * vectors of one frame to the text file "mvs.stt" (opened in append mode).
+ *
+ * mi is walked with a row stride of cols + 1: one extra entry is skipped at
+ * the end of every row. rows/cols are presumably counts of macroblocks.
+ * If the file cannot be opened the function returns without writing anything.
+ */
+void vp8_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols,
+                                        int frame) {
+  int mb_row;
+  int mb_col;
+  int mb_index = 0;
+  FILE *mvs = fopen("mvs.stt", "a");
+
+  /* Passing a null stream to fprintf()/fclose() is undefined behavior. */
+  if (mvs == NULL) return;
+
+  /* print out the macroblock Y modes */
+  fprintf(mvs, "Mb Modes for Frame %d\n", frame);
+
+  for (mb_row = 0; mb_row < rows; ++mb_row) {
+    for (mb_col = 0; mb_col < cols; ++mb_col) {
+      fprintf(mvs, "%2d ", mi[mb_index].mbmi.mode);
+      mb_index++;
+    }
+    fprintf(mvs, "\n");
+    mb_index++; /* skip the extra entry that ends each row */
+  }
+
+  fprintf(mvs, "\n");
+
+  /* print out the macroblock reference frames */
+  mb_index = 0;
+  fprintf(mvs, "Mb mv ref for Frame %d\n", frame);
+
+  for (mb_row = 0; mb_row < rows; ++mb_row) {
+    for (mb_col = 0; mb_col < cols; ++mb_col) {
+      fprintf(mvs, "%2d ", mi[mb_index].mbmi.ref_frame);
+      mb_index++;
+    }
+    fprintf(mvs, "\n");
+    mb_index++;
+  }
+
+  fprintf(mvs, "\n");
+
+  /* print out the macroblock UV modes */
+  mb_index = 0;
+  fprintf(mvs, "UV Modes for Frame %d\n", frame);
+
+  for (mb_row = 0; mb_row < rows; ++mb_row) {
+    for (mb_col = 0; mb_col < cols; ++mb_col) {
+      fprintf(mvs, "%2d ", mi[mb_index].mbmi.uv_mode);
+      mb_index++;
+    }
+    mb_index++;
+    fprintf(mvs, "\n");
+  }
+
+  fprintf(mvs, "\n");
+
+  /* print out the block modes */
+  fprintf(mvs, "Mbs for Frame %d\n", frame);
+  {
+    int b_row;
+    for (b_row = 0; b_row < 4 * rows; ++b_row) {
+      int b_col;
+      int bindex;
+
+      for (b_col = 0; b_col < 4 * cols; ++b_col) {
+        mb_index = (b_row >> 2) * (cols + 1) + (b_col >> 2);
+        bindex = (b_row & 3) * 4 + (b_col & 3);
+
+        if (mi[mb_index].mbmi.mode == B_PRED)
+          fprintf(mvs, "%2d ", mi[mb_index].bmi[bindex].as_mode);
+        else
+          fprintf(mvs, "xx ");
+      }
+
+      fprintf(mvs, "\n");
+    }
+  }
+  fprintf(mvs, "\n");
+
+  /* print out the macroblock mvs */
+  mb_index = 0;
+  fprintf(mvs, "MVs for Frame %d\n", frame);
+
+  for (mb_row = 0; mb_row < rows; ++mb_row) {
+    for (mb_col = 0; mb_col < cols; ++mb_col) {
+      fprintf(mvs, "%5d:%-5d", mi[mb_index].mbmi.mv.as_mv.row / 2,
+              mi[mb_index].mbmi.mv.as_mv.col / 2);
+      mb_index++;
+    }
+    mb_index++;
+    fprintf(mvs, "\n");
+  }
+
+  fprintf(mvs, "\n");
+
+  /* print out the block mvs */
+  fprintf(mvs, "MVs for Frame %d\n", frame);
+  {
+    int b_row;
+    for (b_row = 0; b_row < 4 * rows; ++b_row) {
+      int b_col;
+      int bindex;
+
+      for (b_col = 0; b_col < 4 * cols; ++b_col) {
+        mb_index = (b_row >> 2) * (cols + 1) + (b_col >> 2);
+        bindex = (b_row & 3) * 4 + (b_col & 3);
+        fprintf(mvs, "%3d:%-3d ", mi[mb_index].bmi[bindex].mv.as_mv.row,
+                mi[mb_index].bmi[bindex].mv.as_mv.col);
+      }
+
+      fprintf(mvs, "\n");
+    }
+  }
+  fprintf(mvs, "\n");
+
+  fclose(mvs);
+}
diff --git a/media/libvpx/libvpx/vp8/common/default_coef_probs.h b/media/libvpx/libvpx/vp8/common/default_coef_probs.h
new file mode 100644
index 0000000000..b25e4a45a3
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/default_coef_probs.h
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP8_COMMON_DEFAULT_COEF_PROBS_H_
+#define VPX_VP8_COMMON_DEFAULT_COEF_PROBS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Generated file, included by entropy.c; this header has no includes itself. */
+
+static const vp8_prob default_coef_probs
+ [BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES] = {
+ { /* Block Type ( 0 ) */
+ { /* Coeff Band ( 0 )*/
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } },
+ { /* Coeff Band ( 1 )*/
+ { 253, 136, 254, 255, 228, 219, 128, 128, 128, 128, 128 },
+ { 189, 129, 242, 255, 227, 213, 255, 219, 128, 128, 128 },
+ { 106, 126, 227, 252, 214, 209, 255, 255, 128, 128, 128 } },
+ { /* Coeff Band ( 2 )*/
+ { 1, 98, 248, 255, 236, 226, 255, 255, 128, 128, 128 },
+ { 181, 133, 238, 254, 221, 234, 255, 154, 128, 128, 128 },
+ { 78, 134, 202, 247, 198, 180, 255, 219, 128, 128, 128 } },
+ { /* Coeff Band ( 3 )*/
+ { 1, 185, 249, 255, 243, 255, 128, 128, 128, 128, 128 },
+ { 184, 150, 247, 255, 236, 224, 128, 128, 128, 128, 128 },
+ { 77, 110, 216, 255, 236, 230, 128, 128, 128, 128, 128 } },
+ { /* Coeff Band ( 4 )*/
+ { 1, 101, 251, 255, 241, 255, 128, 128, 128, 128, 128 },
+ { 170, 139, 241, 252, 236, 209, 255, 255, 128, 128, 128 },
+ { 37, 116, 196, 243, 228, 255, 255, 255, 128, 128, 128 } },
+ { /* Coeff Band ( 5 )*/
+ { 1, 204, 254, 255, 245, 255, 128, 128, 128, 128, 128 },
+ { 207, 160, 250, 255, 238, 128, 128, 128, 128, 128, 128 },
+ { 102, 103, 231, 255, 211, 171, 128, 128, 128, 128, 128 } },
+ { /* Coeff Band ( 6 )*/
+ { 1, 152, 252, 255, 240, 255, 128, 128, 128, 128, 128 },
+ { 177, 135, 243, 255, 234, 225, 128, 128, 128, 128, 128 },
+ { 80, 129, 211, 255, 194, 224, 128, 128, 128, 128, 128 } },
+ { /* Coeff Band ( 7 )*/
+ { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 246, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } } },
+ { /* Block Type ( 1 ) */
+ { /* Coeff Band ( 0 )*/
+ { 198, 35, 237, 223, 193, 187, 162, 160, 145, 155, 62 },
+ { 131, 45, 198, 221, 172, 176, 220, 157, 252, 221, 1 },
+ { 68, 47, 146, 208, 149, 167, 221, 162, 255, 223, 128 } },
+ { /* Coeff Band ( 1 )*/
+ { 1, 149, 241, 255, 221, 224, 255, 255, 128, 128, 128 },
+ { 184, 141, 234, 253, 222, 220, 255, 199, 128, 128, 128 },
+ { 81, 99, 181, 242, 176, 190, 249, 202, 255, 255, 128 } },
+ { /* Coeff Band ( 2 )*/
+ { 1, 129, 232, 253, 214, 197, 242, 196, 255, 255, 128 },
+ { 99, 121, 210, 250, 201, 198, 255, 202, 128, 128, 128 },
+ { 23, 91, 163, 242, 170, 187, 247, 210, 255, 255, 128 } },
+ { /* Coeff Band ( 3 )*/
+ { 1, 200, 246, 255, 234, 255, 128, 128, 128, 128, 128 },
+ { 109, 178, 241, 255, 231, 245, 255, 255, 128, 128, 128 },
+ { 44, 130, 201, 253, 205, 192, 255, 255, 128, 128, 128 } },
+ { /* Coeff Band ( 4 )*/
+ { 1, 132, 239, 251, 219, 209, 255, 165, 128, 128, 128 },
+ { 94, 136, 225, 251, 218, 190, 255, 255, 128, 128, 128 },
+ { 22, 100, 174, 245, 186, 161, 255, 199, 128, 128, 128 } },
+ { /* Coeff Band ( 5 )*/
+ { 1, 182, 249, 255, 232, 235, 128, 128, 128, 128, 128 },
+ { 124, 143, 241, 255, 227, 234, 128, 128, 128, 128, 128 },
+ { 35, 77, 181, 251, 193, 211, 255, 205, 128, 128, 128 } },
+ { /* Coeff Band ( 6 )*/
+ { 1, 157, 247, 255, 236, 231, 255, 255, 128, 128, 128 },
+ { 121, 141, 235, 255, 225, 227, 255, 255, 128, 128, 128 },
+ { 45, 99, 188, 251, 195, 217, 255, 224, 128, 128, 128 } },
+ { /* Coeff Band ( 7 )*/
+ { 1, 1, 251, 255, 213, 255, 128, 128, 128, 128, 128 },
+ { 203, 1, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
+ { 137, 1, 177, 255, 224, 255, 128, 128, 128, 128, 128 } } },
+ { /* Block Type ( 2 ) */
+ { /* Coeff Band ( 0 )*/
+ { 253, 9, 248, 251, 207, 208, 255, 192, 128, 128, 128 },
+ { 175, 13, 224, 243, 193, 185, 249, 198, 255, 255, 128 },
+ { 73, 17, 171, 221, 161, 179, 236, 167, 255, 234, 128 } },
+ { /* Coeff Band ( 1 )*/
+ { 1, 95, 247, 253, 212, 183, 255, 255, 128, 128, 128 },
+ { 239, 90, 244, 250, 211, 209, 255, 255, 128, 128, 128 },
+ { 155, 77, 195, 248, 188, 195, 255, 255, 128, 128, 128 } },
+ { /* Coeff Band ( 2 )*/
+ { 1, 24, 239, 251, 218, 219, 255, 205, 128, 128, 128 },
+ { 201, 51, 219, 255, 196, 186, 128, 128, 128, 128, 128 },
+ { 69, 46, 190, 239, 201, 218, 255, 228, 128, 128, 128 } },
+ { /* Coeff Band ( 3 )*/
+ { 1, 191, 251, 255, 255, 128, 128, 128, 128, 128, 128 },
+ { 223, 165, 249, 255, 213, 255, 128, 128, 128, 128, 128 },
+ { 141, 124, 248, 255, 255, 128, 128, 128, 128, 128, 128 } },
+ { /* Coeff Band ( 4 )*/
+ { 1, 16, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
+ { 190, 36, 230, 255, 236, 255, 128, 128, 128, 128, 128 },
+ { 149, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 } },
+ { /* Coeff Band ( 5 )*/
+ { 1, 226, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 247, 192, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 } },
+ { /* Coeff Band ( 6 )*/
+ { 1, 134, 252, 255, 255, 128, 128, 128, 128, 128, 128 },
+ { 213, 62, 250, 255, 255, 128, 128, 128, 128, 128, 128 },
+ { 55, 93, 255, 128, 128, 128, 128, 128, 128, 128, 128 } },
+ { /* Coeff Band ( 7 )*/
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } } },
+ { /* Block Type ( 3 ) */
+ { /* Coeff Band ( 0 )*/
+ { 202, 24, 213, 235, 186, 191, 220, 160, 240, 175, 255 },
+ { 126, 38, 182, 232, 169, 184, 228, 174, 255, 187, 128 },
+ { 61, 46, 138, 219, 151, 178, 240, 170, 255, 216, 128 } },
+ { /* Coeff Band ( 1 )*/
+ { 1, 112, 230, 250, 199, 191, 247, 159, 255, 255, 128 },
+ { 166, 109, 228, 252, 211, 215, 255, 174, 128, 128, 128 },
+ { 39, 77, 162, 232, 172, 180, 245, 178, 255, 255, 128 } },
+ { /* Coeff Band ( 2 )*/
+ { 1, 52, 220, 246, 198, 199, 249, 220, 255, 255, 128 },
+ { 124, 74, 191, 243, 183, 193, 250, 221, 255, 255, 128 },
+ { 24, 71, 130, 219, 154, 170, 243, 182, 255, 255, 128 } },
+ { /* Coeff Band ( 3 )*/
+ { 1, 182, 225, 249, 219, 240, 255, 224, 128, 128, 128 },
+ { 149, 150, 226, 252, 216, 205, 255, 171, 128, 128, 128 },
+ { 28, 108, 170, 242, 183, 194, 254, 223, 255, 255, 128 } },
+ { /* Coeff Band ( 4 )*/
+ { 1, 81, 230, 252, 204, 203, 255, 192, 128, 128, 128 },
+ { 123, 102, 209, 247, 188, 196, 255, 233, 128, 128, 128 },
+ { 20, 95, 153, 243, 164, 173, 255, 203, 128, 128, 128 } },
+ { /* Coeff Band ( 5 )*/
+ { 1, 222, 248, 255, 216, 213, 128, 128, 128, 128, 128 },
+ { 168, 175, 246, 252, 235, 205, 255, 255, 128, 128, 128 },
+ { 47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 } },
+ { /* Coeff Band ( 6 )*/
+ { 1, 121, 236, 253, 212, 214, 255, 255, 128, 128, 128 },
+ { 141, 84, 213, 252, 201, 202, 255, 219, 128, 128, 128 },
+ { 42, 80, 160, 240, 162, 185, 255, 205, 128, 128, 128 } },
+ { /* Coeff Band ( 7 )*/
+ { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 244, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 238, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 } } }
+ };
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP8_COMMON_DEFAULT_COEF_PROBS_H_
diff --git a/media/libvpx/libvpx/vp8/common/dequantize.c b/media/libvpx/libvpx/vp8/common/dequantize.c
new file mode 100644
index 0000000000..8a56ae6868
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/dequantize.c
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#include "vp8/common/blockd.h"
+#include "vpx_mem/vpx_mem.h"
+
+/* Multiplies each of the 16 quantized coefficients in d->qcoeff by the
+ * matching DQC entry and stores the product in d->dqcoeff. */
+void vp8_dequantize_b_c(BLOCKD *d, short *DQC) {
+  short *const dequantized = d->dqcoeff;
+  const short *const quantized = d->qcoeff;
+  int idx;
+
+  for (idx = 0; idx != 16; ++idx) dequantized[idx] = quantized[idx] * DQC[idx];
+}
+
+/* Scales the 16 coefficients of input in place by dq, hands input to
+ * vp8_short_idct4x4llm_c with dest/stride passed for both of its
+ * pointer/stride argument pairs, then zeroes input. */
+void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *dest,
+                            int stride) {
+  int i;
+
+  for (i = 0; i < 16; ++i) input[i] = dq[i] * input[i];
+
+  vp8_short_idct4x4llm_c(input, dest, stride, dest, stride);
+  memset(input, 0, 16 * sizeof(*input)); /* was a literal 32 (= 16 shorts) */
+}
diff --git a/media/libvpx/libvpx/vp8/common/entropy.c b/media/libvpx/libvpx/vp8/common/entropy.c
new file mode 100644
index 0000000000..fc4a3539fd
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/entropy.c
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "entropy.h"
+#include "blockd.h"
+#include "onyxc_int.h"
+#include "vpx_mem/vpx_mem.h"
+
+#include "coefupdateprobs.h"
+/* vp8_norm[v] = left shift that brings the top set bit of v to bit 7; [0] = 0. */
+DECLARE_ALIGNED(16, const unsigned char, vp8_norm[256]) = {
+ 0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+DECLARE_ALIGNED(16, const unsigned char,
+ vp8_coef_bands[16]) = { 0, 1, 2, 3, 6, 4, 5, 6,
+ 6, 6, 6, 6, 6, 6, 6, 7 };
+
+DECLARE_ALIGNED(16, const unsigned char,
+ vp8_prev_token_class[MAX_ENTROPY_TOKENS]) = {
+ 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0
+};
+
+DECLARE_ALIGNED(16, const int, vp8_default_zig_zag1d[16]) = {
+ 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15,
+};
+
+DECLARE_ALIGNED(16, const short,
+ vp8_default_inv_zig_zag[16]) = { 1, 2, 6, 7, 3, 5, 8, 13,
+ 4, 9, 12, 14, 10, 11, 15, 16 };
+
+/* vp8_default_zig_zag_mask generated with:
+
+ void vp8_init_scan_order_mask()
+ {
+ int i;
+
+ for (i = 0; i < 16; ++i)
+ {
+ vp8_default_zig_zag_mask[vp8_default_zig_zag1d[i]] = 1 << i;
+ }
+
+ }
+*/
+DECLARE_ALIGNED(16, const short, vp8_default_zig_zag_mask[16]) = {
+ 1, 2, 32, 64, 4, 16, 128, 4096, 8, 256, 2048, 8192, 512, 1024, 16384, -32768
+};
+
+const int vp8_mb_feature_data_bits[MB_LVL_MAX] = { 7, 6 };
+
+/* Array indices are identical to previously-existing CONTEXT_NODE indices */
+/* corresponding _CONTEXT_NODEs */
+/* clang-format off */
+const vp8_tree_index vp8_coef_tree[22] = {
+ -DCT_EOB_TOKEN, 2, /* 0 = EOB */
+ -ZERO_TOKEN, 4, /* 1 = ZERO */
+ -ONE_TOKEN, 6, /* 2 = ONE */
+ 8, 12, /* 3 = LOW_VAL */
+ -TWO_TOKEN, 10, /* 4 = TWO */
+ -THREE_TOKEN, -FOUR_TOKEN, /* 5 = THREE */
+ 14, 16, /* 6 = HIGH_LOW */
+ -DCT_VAL_CATEGORY1, -DCT_VAL_CATEGORY2, /* 7 = CAT_ONE */
+ 18, 20, /* 8 = CAT_THREEFOUR */
+ -DCT_VAL_CATEGORY3, -DCT_VAL_CATEGORY4, /* 9 = CAT_THREE */
+ -DCT_VAL_CATEGORY5, -DCT_VAL_CATEGORY6 /* 10 = CAT_FIVE */
+};
+/* clang-format on */
+
+/* vp8_coef_encodings generated with:
+ vp8_tokens_from_tree(vp8_coef_encodings, vp8_coef_tree);
+*/
+vp8_token vp8_coef_encodings[MAX_ENTROPY_TOKENS] = {
+ { 2, 2 }, { 6, 3 }, { 28, 5 }, { 58, 6 }, { 59, 6 }, { 60, 6 },
+ { 61, 6 }, { 124, 7 }, { 125, 7 }, { 126, 7 }, { 127, 7 }, { 0, 1 }
+};
+
+/* Trees for extra bits. Probabilities are constant and
+ do not depend on previously encoded bits */
+
+static const vp8_prob Pcat1[] = { 159 };
+static const vp8_prob Pcat2[] = { 165, 145 };
+static const vp8_prob Pcat3[] = { 173, 148, 140 };
+static const vp8_prob Pcat4[] = { 176, 155, 140, 135 };
+static const vp8_prob Pcat5[] = { 180, 157, 141, 134, 130 };
+static const vp8_prob Pcat6[] = { 254, 254, 243, 230, 196, 177,
+ 153, 140, 133, 130, 129 };
+
+/* tree index tables generated with:
+
+ void init_bit_tree(vp8_tree_index *p, int n) {
+ int i = 0;
+
+ while (++i < n) {
+ p[0] = p[1] = i << 1;
+ p += 2;
+ }
+
+ p[0] = p[1] = 0;
+ }
+
+ void init_bit_trees() {
+ init_bit_tree(cat1, 1);
+ init_bit_tree(cat2, 2);
+ init_bit_tree(cat3, 3);
+ init_bit_tree(cat4, 4);
+ init_bit_tree(cat5, 5);
+ init_bit_tree(cat6, 11);
+ }
+*/
+
+static const vp8_tree_index cat1[2] = { 0, 0 };
+static const vp8_tree_index cat2[4] = { 2, 2, 0, 0 };
+static const vp8_tree_index cat3[6] = { 2, 2, 4, 4, 0, 0 };
+static const vp8_tree_index cat4[8] = { 2, 2, 4, 4, 6, 6, 0, 0 };
+static const vp8_tree_index cat5[10] = { 2, 2, 4, 4, 6, 6, 8, 8, 0, 0 };
+static const vp8_tree_index cat6[22] = { 2, 2, 4, 4, 6, 6, 8, 8,
+ 10, 10, 12, 12, 14, 14, 16, 16,
+ 18, 18, 20, 20, 0, 0 };
+
+const vp8_extra_bit_struct vp8_extra_bits[12] = {
+ { 0, 0, 0, 0 }, { 0, 0, 0, 1 }, { 0, 0, 0, 2 },
+ { 0, 0, 0, 3 }, { 0, 0, 0, 4 }, { cat1, Pcat1, 1, 5 },
+ { cat2, Pcat2, 2, 7 }, { cat3, Pcat3, 3, 11 }, { cat4, Pcat4, 4, 19 },
+ { cat5, Pcat5, 5, 35 }, { cat6, Pcat6, 11, 67 }, { 0, 0, 0, 0 }
+};
+
+#include "default_coef_probs.h"
+/* Copies the static default_coef_probs table into pc->fc.coef_probs. */
+void vp8_default_coef_probs(VP8_COMMON *pc) {
+ memcpy(pc->fc.coef_probs, default_coef_probs, sizeof(default_coef_probs));
+}
diff --git a/media/libvpx/libvpx/vp8/common/entropy.h b/media/libvpx/libvpx/vp8/common/entropy.h
new file mode 100644
index 0000000000..fbdb7bcfca
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/entropy.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP8_COMMON_ENTROPY_H_
+#define VPX_VP8_COMMON_ENTROPY_H_
+
+#include "treecoder.h"
+#include "blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Coefficient token alphabet */
+
+#define ZERO_TOKEN 0 /* 0 Extra Bits 0+0 */
+#define ONE_TOKEN 1 /* 1 Extra Bits 0+1 */
+#define TWO_TOKEN 2 /* 2 Extra Bits 0+1 */
+#define THREE_TOKEN 3 /* 3 Extra Bits 0+1 */
+#define FOUR_TOKEN 4 /* 4 Extra Bits 0+1 */
+#define DCT_VAL_CATEGORY1 5 /* 5-6 Extra Bits 1+1 */
+#define DCT_VAL_CATEGORY2 6 /* 7-10 Extra Bits 2+1 */
+#define DCT_VAL_CATEGORY3 7 /* 11-18 Extra Bits 3+1 */
+#define DCT_VAL_CATEGORY4 8 /* 19-34 Extra Bits 4+1 */
+#define DCT_VAL_CATEGORY5 9 /* 35-66 Extra Bits 5+1 */
+#define DCT_VAL_CATEGORY6 10 /* 67+ Extra Bits 11+1 */
+#define DCT_EOB_TOKEN 11 /* EOB Extra Bits 0+0 */
+
+#define MAX_ENTROPY_TOKENS 12
+#define ENTROPY_NODES 11
+
+extern const vp8_tree_index vp8_coef_tree[];
+
+extern const struct vp8_token_struct vp8_coef_encodings[MAX_ENTROPY_TOKENS];
+
+typedef struct {
+ vp8_tree_p tree; /* bit tree for the extra bits; null when there are none */
+ const vp8_prob *prob; /* one probability per extra bit; null when none */
+ int Len; /* number of extra bits */
+ int base_val; /* first value of the token's range */
+} vp8_extra_bit_struct;
+
+extern const vp8_extra_bit_struct
+ vp8_extra_bits[12]; /* indexed by token value */
+
+#define PROB_UPDATE_BASELINE_COST 7
+
+#define MAX_PROB 255
+#define DCT_MAX_VALUE 2048
+
+/* Coefficients are predicted via a 3-dimensional probability table. */
+
+/* Outside dimension. 0 = Y no DC, 1 = Y2, 2 = UV, 3 = Y with DC */
+
+#define BLOCK_TYPES 4
+
+/* Middle dimension is a coarsening of the coefficient's
+ position within the 4x4 DCT. */
+
+#define COEF_BANDS 8
+extern DECLARE_ALIGNED(16, const unsigned char, vp8_coef_bands[16]);
+
+/* Inside dimension is 3-valued measure of nearby complexity, that is,
+ the extent to which nearby coefficients are nonzero. For the first
+ coefficient (DC, unless block type is 0), we look at the (already encoded)
+ blocks above and to the left of the current block. The context index is
+ then the number (0,1,or 2) of these blocks having nonzero coefficients.
+ After decoding a coefficient, the measure is roughly the size of the
+ most recently decoded coefficient (0 for 0, 1 for 1, 2 for >1).
+ Note that the intuitive meaning of this measure changes as coefficients
+ are decoded, e.g., prior to the first token, a zero means that my neighbors
+ are empty while, after the first token, because of the use of end-of-block,
+ a zero means we just decoded a zero and hence guarantees that a non-zero
+ coefficient will appear later in this block. However, this shift
+ in meaning is perfectly OK because our context depends also on the
+ coefficient band (and since zigzag positions 0, 1, and 2 are in
+ distinct bands). */
+
+/*# define DC_TOKEN_CONTEXTS 3*/ /* 00, 0!0, !0!0 */
+#define PREV_COEF_CONTEXTS 3
+
+extern DECLARE_ALIGNED(16, const unsigned char,
+ vp8_prev_token_class[MAX_ENTROPY_TOKENS]);
+
+extern const vp8_prob vp8_coef_update_probs[BLOCK_TYPES][COEF_BANDS]
+ [PREV_COEF_CONTEXTS][ENTROPY_NODES];
+
+struct VP8Common;
+void vp8_default_coef_probs(struct VP8Common *);
+
+extern DECLARE_ALIGNED(16, const int, vp8_default_zig_zag1d[16]);
+extern DECLARE_ALIGNED(16, const short, vp8_default_inv_zig_zag[16]);
+extern DECLARE_ALIGNED(16, const short, vp8_default_zig_zag_mask[16]);
+extern const int vp8_mb_feature_data_bits[MB_LVL_MAX];
+
+void vp8_coef_tree_initialize(void);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP8_COMMON_ENTROPY_H_
diff --git a/media/libvpx/libvpx/vp8/common/entropymode.c b/media/libvpx/libvpx/vp8/common/entropymode.c
new file mode 100644
index 0000000000..f61e0c2e2b
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/entropymode.c
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#define USE_PREBUILT_TABLES
+
+#include "entropymode.h"
+#include "entropy.h"
+#include "vpx_mem/vpx_mem.h"
+
+#include "vp8_entropymodedata.h"
+/* Classify a pair of motion vectors into one of the SUBMVREF_* contexts.
+ * l and a are presumably the left and above neighbours' vectors (going by
+ * the enumerator names). The vectors are compared through their as_int
+ * members, and the checks run in priority order: equal and zero, equal,
+ * a zero, l zero, otherwise SUBMVREF_NORMAL. */
+int vp8_mv_cont(const int_mv *l, const int_mv *a) {
+ int lez = (l->as_int == 0); /* l is the zero vector */
+ int aez = (a->as_int == 0); /* a is the zero vector */
+ int lea = (l->as_int == a->as_int); /* l and a are identical */
+
+ if (lea && lez) return SUBMVREF_LEFT_ABOVE_ZED;
+
+ if (lea) return SUBMVREF_LEFT_ABOVE_SAME;
+
+ if (aez) return SUBMVREF_ABOVE_ZED;
+
+ if (lez) return SUBMVREF_LEFT_ZED;
+
+ return SUBMVREF_NORMAL;
+}
+
+static const vp8_prob sub_mv_ref_prob[VP8_SUBMVREFS - 1] = { 180, 162, 25 };
+
+const vp8_prob vp8_sub_mv_ref_prob2[SUBMVREF_COUNT][VP8_SUBMVREFS - 1] = {
+ { 147, 136, 18 },
+ { 106, 145, 1 },
+ { 179, 121, 1 },
+ { 223, 1, 34 },
+ { 208, 1, 1 }
+};
+
+const vp8_mbsplit vp8_mbsplits[VP8_NUMMBSPLITS] = {
+ { 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 },
+ { 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1 },
+ { 0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }
+};
+
+const int vp8_mbsplit_count[VP8_NUMMBSPLITS] = { 2, 2, 4, 16 };
+
+const vp8_prob vp8_mbsplit_probs[VP8_NUMMBSPLITS - 1] = { 110, 111, 150 };
+
+/* Array indices are identical to previously-existing INTRAMODECONTEXTNODES. */
+
+const vp8_tree_index vp8_bmode_tree[18] = /* INTRAMODECONTEXTNODE value */
+ {
+ -B_DC_PRED, 2, /* 0 = DC_NODE */
+ -B_TM_PRED, 4, /* 1 = TM_NODE */
+ -B_VE_PRED, 6, /* 2 = VE_NODE */
+ 8, 12, /* 3 = COM_NODE */
+ -B_HE_PRED, 10, /* 4 = HE_NODE */
+ -B_RD_PRED, -B_VR_PRED, /* 5 = RD_NODE */
+ -B_LD_PRED, 14, /* 6 = LD_NODE */
+ -B_VL_PRED, 16, /* 7 = VL_NODE */
+ -B_HD_PRED, -B_HU_PRED /* 8 = HD_NODE */
+ };
+
+/* Again, these trees use the same probability indices as their
+ explicitly-programmed predecessors. */
+
+const vp8_tree_index vp8_ymode_tree[8] = {
+ -DC_PRED, 2, 4, 6, -V_PRED, -H_PRED, -TM_PRED, -B_PRED
+};
+
+const vp8_tree_index vp8_kf_ymode_tree[8] = { -B_PRED, 2, 4,
+ 6, -DC_PRED, -V_PRED,
+ -H_PRED, -TM_PRED };
+
+const vp8_tree_index vp8_uv_mode_tree[6] = { -DC_PRED, 2, -V_PRED,
+ 4, -H_PRED, -TM_PRED };
+
+const vp8_tree_index vp8_mbsplit_tree[6] = { -3, 2, -2, 4, -0, -1 };
+
+const vp8_tree_index vp8_mv_ref_tree[8] = { -ZEROMV, 2, -NEARESTMV, 4,
+ -NEARMV, 6, -NEWMV, -SPLITMV };
+
+const vp8_tree_index vp8_sub_mv_ref_tree[6] = { -LEFT4X4, 2, -ABOVE4X4,
+ 4, -ZERO4X4, -NEW4X4 };
+
+const vp8_tree_index vp8_small_mvtree[14] = { 2, 8, 4, 6, -0, -1, -2,
+ -3, 10, 12, -4, -5, -6, -7 };
+/* Reset the mode probabilities held in x->fc to their defaults: ymode_prob
+ * and uv_mode_prob are copied from the vp8_ymode_prob / vp8_uv_mode_prob
+ * tables, sub_mv_ref_prob from the file-local sub_mv_ref_prob table. Each
+ * copy is sized by the source table. */
+void vp8_init_mbmode_probs(VP8_COMMON *x) {
+ memcpy(x->fc.ymode_prob, vp8_ymode_prob, sizeof(vp8_ymode_prob));
+ memcpy(x->fc.uv_mode_prob, vp8_uv_mode_prob, sizeof(vp8_uv_mode_prob));
+ memcpy(x->fc.sub_mv_ref_prob, sub_mv_ref_prob, sizeof(sub_mv_ref_prob));
+}
+/* Copy the default vp8_bmode_prob table into dest. The copy length is
+ * sizeof(vp8_bmode_prob), not the size of dest (which is a plain pointer
+ * here), so dest must be at least that large.
+ * NOTE(review): presumably vp8_bmode_prob has VP8_BINTRAMODES - 1 entries,
+ * matching the prototype - confirm against its definition. */
+void vp8_default_bmode_probs(vp8_prob dest[VP8_BINTRAMODES - 1]) {
+ memcpy(dest, vp8_bmode_prob, sizeof(vp8_bmode_prob));
+}
diff --git a/media/libvpx/libvpx/vp8/common/entropymode.h b/media/libvpx/libvpx/vp8/common/entropymode.h
new file mode 100644
index 0000000000..c772cece57
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/entropymode.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP8_COMMON_ENTROPYMODE_H_
+#define VPX_VP8_COMMON_ENTROPYMODE_H_
+
+#include "onyxc_int.h"
+#include "treecoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/* Contexts returned by vp8_mv_cont() for its (l, a) motion vector pair. */
+typedef enum {
+ SUBMVREF_NORMAL, /* l and a differ and neither is zero */
+ SUBMVREF_LEFT_ZED, /* l is zero, a is not */
+ SUBMVREF_ABOVE_ZED, /* a is zero, l is not */
+ SUBMVREF_LEFT_ABOVE_SAME, /* l equals a, both non-zero */
+ SUBMVREF_LEFT_ABOVE_ZED /* l and a are both zero */
+} sumvfref_t;
+
+typedef int vp8_mbsplit[16];
+
+#define VP8_NUMMBSPLITS 4
+
+extern const vp8_mbsplit vp8_mbsplits[VP8_NUMMBSPLITS];
+
+extern const int vp8_mbsplit_count[VP8_NUMMBSPLITS]; /* # of subsets */
+
+extern const vp8_prob vp8_mbsplit_probs[VP8_NUMMBSPLITS - 1];
+
+extern int vp8_mv_cont(const int_mv *l, const int_mv *a);
+#define SUBMVREF_COUNT 5
+extern const vp8_prob vp8_sub_mv_ref_prob2[SUBMVREF_COUNT][VP8_SUBMVREFS - 1];
+
+extern const unsigned int vp8_kf_default_bmode_counts[VP8_BINTRAMODES]
+ [VP8_BINTRAMODES]
+ [VP8_BINTRAMODES];
+
+extern const vp8_tree_index vp8_bmode_tree[];
+
+extern const vp8_tree_index vp8_ymode_tree[];
+extern const vp8_tree_index vp8_kf_ymode_tree[];
+extern const vp8_tree_index vp8_uv_mode_tree[];
+
+extern const vp8_tree_index vp8_mbsplit_tree[];
+extern const vp8_tree_index vp8_mv_ref_tree[];
+extern const vp8_tree_index vp8_sub_mv_ref_tree[];
+
+extern const struct vp8_token_struct vp8_bmode_encodings[VP8_BINTRAMODES];
+extern const struct vp8_token_struct vp8_ymode_encodings[VP8_YMODES];
+extern const struct vp8_token_struct vp8_kf_ymode_encodings[VP8_YMODES];
+extern const struct vp8_token_struct vp8_uv_mode_encodings[VP8_UV_MODES];
+extern const struct vp8_token_struct vp8_mbsplit_encodings[VP8_NUMMBSPLITS];
+
+/* Inter mode values do not start at zero */
+
+extern const struct vp8_token_struct vp8_mv_ref_encoding_array[VP8_MVREFS];
+extern const struct vp8_token_struct
+ vp8_sub_mv_ref_encoding_array[VP8_SUBMVREFS];
+
+extern const vp8_tree_index vp8_small_mvtree[];
+
+extern const struct vp8_token_struct vp8_small_mvencodings[8];
+
+/* Key frame default mode probs */
+extern const vp8_prob vp8_kf_bmode_prob[VP8_BINTRAMODES][VP8_BINTRAMODES]
+ [VP8_BINTRAMODES - 1];
+extern const vp8_prob vp8_kf_uv_mode_prob[VP8_UV_MODES - 1];
+extern const vp8_prob vp8_kf_ymode_prob[VP8_YMODES - 1];
+
+void vp8_init_mbmode_probs(VP8_COMMON *x);
+void vp8_default_bmode_probs(vp8_prob dest[VP8_BINTRAMODES - 1]);
+void vp8_kf_default_bmode_probs(
+ vp8_prob dest[VP8_BINTRAMODES][VP8_BINTRAMODES][VP8_BINTRAMODES - 1]);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP8_COMMON_ENTROPYMODE_H_
diff --git a/media/libvpx/libvpx/vp8/common/entropymv.c b/media/libvpx/libvpx/vp8/common/entropymv.c
new file mode 100644
index 0000000000..fb4f0c889f
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/entropymv.c
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "entropymv.h"
+
+/* clang-format off */
+const MV_CONTEXT vp8_mv_update_probs[2] = {
+ { { /* entry 0; presumably row, as in vp8_default_mv_context */
+ 237, /* is short */
+ 246, /* sign */
+ 253, 253, 254, 254, 254, 254, 254, /* short tree */
+ 254, 254, 254, 254, 254, 250, 250, 252, 254, 254 /* long bits */
+ } },
+ { { /* entry 1; presumably column */
+ 231, /* is short */
+ 243, /* sign */
+ 245, 253, 254, 254, 254, 254, 254, /* short tree */
+ 254, 254, 254, 254, 254, 251, 251, 254, 254, 254 /* long bits */
+ } }
+};
+/* clang-format on */
+
+const MV_CONTEXT vp8_default_mv_context[2] = {
+ { {
+ /* row */
+ 162, /* is short */
+ 128, /* sign */
+ 225, 146, 172, 147, 214, 39, 156, /* short tree */
+ 128, 129, 132, 75, 145, 178, 206, 239, 254, 254 /* long bits */
+ } },
+
+ { {
+ /* same for column */
+ 164, /* is short */
+ 128, /* sign */
+ 204, 170, 119, 235, 140, 230, 228, /* short tree */
+ 128, 130, 130, 74, 148, 180, 203, 236, 254, 254 /* long bits */
+
+ } }
+};
diff --git a/media/libvpx/libvpx/vp8/common/entropymv.h b/media/libvpx/libvpx/vp8/common/entropymv.h
new file mode 100644
index 0000000000..40039f5b2c
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/entropymv.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP8_COMMON_ENTROPYMV_H_
+#define VPX_VP8_COMMON_ENTROPYMV_H_
+
+#include "treecoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+enum {
+ mv_max = 1023, /* max absolute value of a MV component */
+ MVvals = (2 * mv_max) + 1, /* # possible values of a MV component */
+ mvfp_max = 255, /* max absolute value of a full pixel MV component */
+ MVfpvals = (2 * mvfp_max) + 1, /* # possible full pixel MV values */
+
+ mvlong_width = 10, /* # of long value bits (mv_max < 1 << 10) */
+ mvnum_short = 8, /* magnitudes 0 through 7 */
+
+ /* probability offsets for coding each MV component */
+
+ mvpis_short = 0, /* short (<= 7) vs long (>= 8) */
+ MVPsign, /* sign for non-zero */
+ MVPshort, /* 8 short values = 7-position tree */
+
+ MVPbits = MVPshort + mvnum_short - 1, /* mvlong_width long value bits */
+ MVPcount = MVPbits + mvlong_width /* (with independent probabilities) */
+};
+
+typedef struct mv_context {
+ vp8_prob prob[MVPcount]; /* often come in row, col pairs */
+} MV_CONTEXT;
+
+extern const MV_CONTEXT vp8_mv_update_probs[2], vp8_default_mv_context[2];
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP8_COMMON_ENTROPYMV_H_
diff --git a/media/libvpx/libvpx/vp8/common/extend.c b/media/libvpx/libvpx/vp8/common/extend.c
new file mode 100644
index 0000000000..b52e9fe93c
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/extend.c
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "extend.h"
+#include "vpx_mem/vpx_mem.h"
+/* Copy a w x h plane from s to d and replicate its edge pixels into the
+ * border around d: el/er bytes to the left/right of every row, then et/eb
+ * whole rows above/below, each a copy of the first/last already-extended
+ * row. With interleave_step > 1 only every interleave_step-th source byte
+ * of a row is read; values below 1 are treated as 1. d must have room for
+ * the requested borders on all four sides. */
+static void copy_and_extend_plane(
+ unsigned char *s, /* source */
+ int sp, /* source pitch */
+ unsigned char *d, /* destination */
+ int dp, /* destination pitch */
+ int h, /* height */
+ int w, /* width */
+ int et, /* extend top border */
+ int el, /* extend left border */
+ int eb, /* extend bottom border */
+ int er, /* extend right border */
+ int interleave_step) { /* step between pixels of the current plane */
+ int i, j;
+ unsigned char *src_ptr1, *src_ptr2;
+ unsigned char *dest_ptr1, *dest_ptr2;
+ int linesize;
+
+ if (interleave_step < 1) interleave_step = 1;
+
+ /* Per row: fill the left border with the first source pixel, copy the row
+ * itself, and fill the right border with the last source pixel. */
+ src_ptr1 = s; /* first pixel of the row */
+ src_ptr2 = s + (w - 1) * interleave_step; /* last pixel of the row */
+ dest_ptr1 = d - el; /* start of the left border */
+ dest_ptr2 = d + w; /* start of the right border */
+
+ for (i = 0; i < h; ++i) {
+ memset(dest_ptr1, src_ptr1[0], el);
+ if (interleave_step == 1) {
+ memcpy(dest_ptr1 + el, src_ptr1, w);
+ } else {
+ for (j = 0; j < w; j++) {
+ dest_ptr1[el + j] = src_ptr1[interleave_step * j];
+ }
+ }
+ memset(dest_ptr2, src_ptr2[0], er);
+ src_ptr1 += sp;
+ src_ptr2 += sp;
+ dest_ptr1 += dp;
+ dest_ptr2 += dp;
+ }
+
+ /* Now copy the top and bottom lines into each line of the respective
+ * borders. The sources are rows of d that already include the left and
+ * right extension, so the corners get filled as well.
+ */
+ src_ptr1 = d - el; /* first extended row */
+ src_ptr2 = d + dp * (h - 1) - el; /* last extended row */
+ dest_ptr1 = d + dp * (-et) - el; /* topmost border row */
+ dest_ptr2 = d + dp * (h)-el; /* first row below the plane */
+ linesize = el + er + w;
+
+ for (i = 0; i < et; ++i) {
+ memcpy(dest_ptr1, src_ptr1, linesize);
+ dest_ptr1 += dp;
+ }
+
+ for (i = 0; i < eb; ++i) {
+ memcpy(dest_ptr2, src_ptr2, linesize);
+ dest_ptr2 += dp;
+ }
+}
+/* Copy the Y, U and V planes of src into dst and extend each plane into
+ * dst's border. The top/left extension is dst->border (halved for chroma);
+ * the bottom/right extension additionally covers the rows/columns by which
+ * dst is larger than src. U and V are read with a step of 2 when the source
+ * chroma is detected as interleaved, otherwise 1. */
+void vp8_copy_and_extend_frame(YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst) {
+ int et = dst->border;
+ int el = dst->border;
+ int eb = dst->border + dst->y_height - src->y_height;
+ int er = dst->border + dst->y_width - src->y_width;
+
+ // detect nv12 colorspace: V starts one byte after U, so chroma is interleaved
+ int chroma_step = src->v_buffer - src->u_buffer == 1 ? 2 : 1;
+
+ copy_and_extend_plane(src->y_buffer, src->y_stride, dst->y_buffer,
+ dst->y_stride, src->y_height, src->y_width, et, el, eb,
+ er, 1);
+
+ et = dst->border >> 1; /* chroma borders are half the luma border */
+ el = dst->border >> 1;
+ eb = (dst->border >> 1) + dst->uv_height - src->uv_height;
+ er = (dst->border >> 1) + dst->uv_width - src->uv_width;
+
+ copy_and_extend_plane(src->u_buffer, src->uv_stride, dst->u_buffer,
+ dst->uv_stride, src->uv_height, src->uv_width, et, el,
+ eb, er, chroma_step);
+
+ copy_and_extend_plane(src->v_buffer, src->uv_stride, dst->v_buffer,
+ dst->uv_stride, src->uv_height, src->uv_width, et, el,
+ eb, er, chroma_step);
+}
+/* Like vp8_copy_and_extend_frame(), but copies only the rectangle whose
+ * top-left corner is (srcx, srcy) and whose size is srcw x srch, all given
+ * in luma pixels. A side is extended only when the rectangle reaches that
+ * side of the source frame. For chroma the offsets are halved (rounding
+ * down) and the border and rectangle sizes are halved rounding up. */
+void vp8_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst, int srcy,
+ int srcx, int srch, int srcw) {
+ int et = dst->border;
+ int el = dst->border;
+ int eb = dst->border + dst->y_height - src->y_height;
+ int er = dst->border + dst->y_width - src->y_width;
+ int src_y_offset = srcy * src->y_stride + srcx;
+ int dst_y_offset = srcy * dst->y_stride + srcx;
+ int src_uv_offset = ((srcy * src->uv_stride) >> 1) + (srcx >> 1);
+ int dst_uv_offset = ((srcy * dst->uv_stride) >> 1) + (srcx >> 1);
+ // detect nv12 colorspace: V starts one byte after U, so chroma is interleaved
+ int chroma_step = src->v_buffer - src->u_buffer == 1 ? 2 : 1;
+
+ /* Only extend on the sides where the rectangle touches the frame border. */
+ if (srcy) et = 0;
+ if (srcx) el = 0;
+ if (srcy + srch != src->y_height) eb = 0;
+ if (srcx + srcw != src->y_width) er = 0;
+
+ copy_and_extend_plane(src->y_buffer + src_y_offset, src->y_stride,
+ dst->y_buffer + dst_y_offset, dst->y_stride, srch, srcw,
+ et, el, eb, er, 1);
+
+ et = (et + 1) >> 1; /* halve for chroma, rounding up */
+ el = (el + 1) >> 1;
+ eb = (eb + 1) >> 1;
+ er = (er + 1) >> 1;
+ srch = (srch + 1) >> 1;
+ srcw = (srcw + 1) >> 1;
+
+ copy_and_extend_plane(src->u_buffer + src_uv_offset, src->uv_stride,
+ dst->u_buffer + dst_uv_offset, dst->uv_stride, srch,
+ srcw, et, el, eb, er, chroma_step);
+
+ copy_and_extend_plane(src->v_buffer + src_uv_offset, src->uv_stride,
+ dst->v_buffer + dst_uv_offset, dst->uv_stride, srch,
+ srcw, et, el, eb, er, chroma_step);
+}
+
+/* Extend only the bottom two rows reachable from the given pointers (Y rows
+ * 14 and 15, U/V rows 6 and 7), for intra prediction purposes: in each of
+ * those rows the four bytes starting at the pointer are set to the byte
+ * just before the pointer. Only the strides of ybf are read.
+ * NOTE(review): presumably the pointers address the position just past the
+ * right edge of the decoded macroblock row - confirm against callers. */
+void vp8_extend_mb_row(YV12_BUFFER_CONFIG *ybf, unsigned char *YPtr,
+ unsigned char *UPtr, unsigned char *VPtr) {
+ int i;
+
+ YPtr += ybf->y_stride * 14; /* Y row 14 */
+ UPtr += ybf->uv_stride * 6; /* U row 6 */
+ VPtr += ybf->uv_stride * 6; /* V row 6 */
+
+ for (i = 0; i < 4; ++i) {
+ YPtr[i] = YPtr[-1];
+ UPtr[i] = UPtr[-1];
+ VPtr[i] = VPtr[-1];
+ }
+
+ YPtr += ybf->y_stride; /* Y row 15 */
+ UPtr += ybf->uv_stride; /* U row 7 */
+ VPtr += ybf->uv_stride; /* V row 7 */
+
+ for (i = 0; i < 4; ++i) {
+ YPtr[i] = YPtr[-1];
+ UPtr[i] = UPtr[-1];
+ VPtr[i] = VPtr[-1];
+ }
+}
diff --git a/media/libvpx/libvpx/vp8/common/extend.h b/media/libvpx/libvpx/vp8/common/extend.h
new file mode 100644
index 0000000000..586a38a4f3
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/extend.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP8_COMMON_EXTEND_H_
+#define VPX_VP8_COMMON_EXTEND_H_
+
+#include "vpx_scale/yv12config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp8_extend_mb_row(YV12_BUFFER_CONFIG *ybf, unsigned char *YPtr,
+ unsigned char *UPtr, unsigned char *VPtr);
+void vp8_copy_and_extend_frame(YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst);
+void vp8_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst, int srcy,
+ int srcx, int srch, int srcw);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP8_COMMON_EXTEND_H_
diff --git a/media/libvpx/libvpx/vp8/common/filter.c b/media/libvpx/libvpx/vp8/common/filter.c
new file mode 100644
index 0000000000..267498335c
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/filter.c
@@ -0,0 +1,381 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vp8_rtcd.h"
+#include "vp8/common/filter.h"
+/* 2-tap bilinear filters, one row per offset 0..7 (the xoffset/yoffset
+ * index used by the predict functions below). The two taps of every row
+ * sum to 128 (VP8_FILTER_WEIGHT). */
+DECLARE_ALIGNED(16, const short, vp8_bilinear_filters[8][2]) = {
+ { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
+ { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }
+};
+/* 6-tap sub-pixel filters, one row per 1/8 pel offset 0..7. The taps of
+ * every row sum to 128 (VP8_FILTER_WEIGHT); rows 5..7 are rows 3..1 with
+ * the taps reversed. */
+DECLARE_ALIGNED(16, const short, vp8_sub_pel_filters[8][6]) = {
+
+ { 0, 0, 128, 0, 0,
+ 0 }, /* note that 1/8 pel positions are just as per alpha -0.5 bicubic */
+ { 0, -6, 123, 12, -1, 0 },
+ { 2, -11, 108, 36, -8, 1 }, /* New 1/4 pel 6 tap filter */
+ { 0, -9, 93, 50, -6, 0 },
+ { 3, -16, 77, 77, -16, 3 }, /* New 1/2 pel 6 tap filter */
+ { 0, -6, 50, 93, -9, 0 },
+ { 1, -8, 36, 108, -11, 2 }, /* New 3/4 pel 6 tap filter (1/4 pel mirrored) */
+ { 0, -1, 12, 123, -6, 0 },
+};
+/* First pass of the separable 6-tap filter. For every output sample the six
+ * taps in vp8_filter are applied to the source samples at offsets
+ * -2*pixel_step .. +3*pixel_step around src_ptr; the sum is rounded, shifted
+ * down by VP8_FILTER_SHIFT and clamped to 0..255. Results are stored as int
+ * in output_ptr, packed with a row length of output_width. Source rows are
+ * src_pixels_per_line apart. */
+static void filter_block2d_first_pass(unsigned char *src_ptr, int *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp8_filter) {
+ unsigned int i, j;
+ int Temp;
+
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; ++j) {
+ Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp8_filter[0]) +
+ ((int)src_ptr[-1 * (int)pixel_step] * vp8_filter[1]) +
+ ((int)src_ptr[0] * vp8_filter[2]) +
+ ((int)src_ptr[pixel_step] * vp8_filter[3]) +
+ ((int)src_ptr[2 * pixel_step] * vp8_filter[4]) +
+ ((int)src_ptr[3 * pixel_step] * vp8_filter[5]) +
+ (VP8_FILTER_WEIGHT >> 1); /* Rounding */
+
+ /* Normalize back to 0-255 */
+ Temp = Temp >> VP8_FILTER_SHIFT;
+
+ if (Temp < 0) {
+ Temp = 0;
+ } else if (Temp > 255) {
+ Temp = 255;
+ }
+
+ output_ptr[j] = Temp;
+ src_ptr++;
+ }
+
+ /* Next row: src_ptr already advanced by output_width within the row */
+ src_ptr += src_pixels_per_line - output_width;
+ output_ptr += output_width;
+ }
+}
+/* Second pass of the separable 6-tap filter. Reads the int samples produced
+ * by the first pass, applies the six taps at offsets -2*pixel_step ..
+ * +3*pixel_step (callers in this file pass the row length of the
+ * intermediate buffer as pixel_step, which makes the filter vertical),
+ * rounds, shifts down by VP8_FILTER_SHIFT, clamps to 0..255 and writes
+ * unsigned char output with a row pitch of output_pitch. */
+static void filter_block2d_second_pass(int *src_ptr, unsigned char *output_ptr,
+ int output_pitch,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp8_filter) {
+ unsigned int i, j;
+ int Temp;
+
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; ++j) {
+ /* Apply filter */
+ Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp8_filter[0]) +
+ ((int)src_ptr[-1 * (int)pixel_step] * vp8_filter[1]) +
+ ((int)src_ptr[0] * vp8_filter[2]) +
+ ((int)src_ptr[pixel_step] * vp8_filter[3]) +
+ ((int)src_ptr[2 * pixel_step] * vp8_filter[4]) +
+ ((int)src_ptr[3 * pixel_step] * vp8_filter[5]) +
+ (VP8_FILTER_WEIGHT >> 1); /* Rounding */
+
+ /* Normalize back to 0-255 */
+ Temp = Temp >> VP8_FILTER_SHIFT;
+
+ if (Temp < 0) {
+ Temp = 0;
+ } else if (Temp > 255) {
+ Temp = 255;
+ }
+
+ output_ptr[j] = (unsigned char)Temp;
+ src_ptr++;
+ }
+
+ /* Start next row: src_ptr already advanced by output_width */
+ src_ptr += src_pixels_per_line - output_width;
+ output_ptr += output_pitch;
+ }
+}
+/* 2-D 6-tap filter for a 4x4 block. The horizontal pass produces 9 rows of
+ * 4 samples starting two rows above src_ptr (4 output rows plus the 2 above
+ * and 3 below that the vertical taps need); the vertical pass then starts
+ * at the third intermediate row. */
+static void filter_block2d(unsigned char *src_ptr, unsigned char *output_ptr,
+ unsigned int src_pixels_per_line, int output_pitch,
+ const short *HFilter, const short *VFilter) {
+ int FData[9 * 4]; /* Temp data buffer used in filtering: 9 rows of 4 */
+
+ /* First filter 1-D horizontally... */
+ filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData,
+ src_pixels_per_line, 1, 9, 4, HFilter);
+
+ /* then filter vertically, skipping the two leading rows (2 * 4 entries) */
+ filter_block2d_second_pass(FData + 8, output_ptr, output_pitch, 4, 4, 4, 4,
+ VFilter);
+}
+/* 4x4 six-tap sub-pixel prediction. xoffset and yoffset select the
+ * horizontal and vertical rows of vp8_sub_pel_filters (valid indices are
+ * 0..7; not checked here). The result is written to dst_ptr with a row
+ * pitch of dst_pitch. */
+void vp8_sixtap_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line,
+ int xoffset, int yoffset, unsigned char *dst_ptr,
+ int dst_pitch) {
+ const short *HFilter;
+ const short *VFilter;
+
+ HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
+ VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
+
+ filter_block2d(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter,
+ VFilter);
+}
+void vp8_sixtap_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line,
+ int xoffset, int yoffset, unsigned char *dst_ptr,
+ int dst_pitch) {
+ const short *HFilter;
+ const short *VFilter;
+ int FData[13 * 16]; /* Temp data buffer; 13 rows of 8 are actually used */
+
+ HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap, index 0..7 unchecked */
+ VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap, index 0..7 unchecked */
+
+ /* First filter 1-D horizontally: 13 rows (8 + 5 for the vertical taps)
+ * of 8 samples, starting two rows above src_ptr. */
+ filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData,
+ src_pixels_per_line, 1, 13, 8, HFilter);
+
+ /* then filter vertically, skipping the two leading rows (2 * 8 entries) */
+ filter_block2d_second_pass(FData + 16, dst_ptr, dst_pitch, 8, 8, 8, 8,
+ VFilter);
+}
+/* Six-tap sub-pixel prediction for a block 8 wide and 4 high. xoffset and
+ * yoffset select rows of vp8_sub_pel_filters (0..7, not checked here). */
+void vp8_sixtap_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line,
+ int xoffset, int yoffset, unsigned char *dst_ptr,
+ int dst_pitch) {
+ const short *HFilter;
+ const short *VFilter;
+ int FData[13 * 16]; /* Temp data buffer; 9 rows of 8 are actually used */
+
+ HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
+ VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
+
+ /* First filter 1-D horizontally: 9 rows (4 + 5 for the vertical taps)
+ * of 8 samples, starting two rows above src_ptr. */
+ filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData,
+ src_pixels_per_line, 1, 9, 8, HFilter);
+
+ /* then filter vertically, skipping the two leading rows (2 * 8 entries) */
+ filter_block2d_second_pass(FData + 16, dst_ptr, dst_pitch, 8, 8, 4, 8,
+ VFilter);
+}
+/* 16x16 six-tap sub-pixel prediction. xoffset and yoffset select rows of
+ * vp8_sub_pel_filters (0..7, not checked here). */
+void vp8_sixtap_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line,
+ int xoffset, int yoffset, unsigned char *dst_ptr,
+ int dst_pitch) {
+ const short *HFilter;
+ const short *VFilter;
+ int FData[21 * 24]; /* Temp data buffer; 21 rows of 16 are actually used */
+
+ HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
+ VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
+
+ /* First filter 1-D horizontally: 21 rows (16 + 5 for the vertical taps)
+ * of 16 samples, starting two rows above src_ptr. */
+ filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData,
+ src_pixels_per_line, 1, 21, 16, HFilter);
+
+ /* then filter vertically, skipping the two leading rows (2 * 16 entries) */
+ filter_block2d_second_pass(FData + 32, dst_ptr, dst_pitch, 16, 16, 16, 16,
+ VFilter);
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : filter_block2d_bil_first_pass
+ *
+ * INPUTS : UINT8 *src_ptr : Pointer to source block.
+ * UINT32 src_stride : Stride of source block.
+ * UINT32 height : Block height.
+ * UINT32 width : Block width.
+ * INT16 *vp8_filter : Array of 2 bi-linear filter taps.
+ *
+ * OUTPUTS : UINT16 *dst_ptr : Pointer to filtered block.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block
+ * in the horizontal direction to produce the filtered output
+ * block. Used to implement first-pass of 2-D separable filter.
+ *
+ * SPECIAL NOTES : Produces 16-bit (unsigned short) output, packed with a row
+ * length of width, for the next pass. Reads one source pixel
+ * to the right of every output position.
+ * Two filter taps should sum to VP8_FILTER_WEIGHT.
+ *
+ ****************************************************************************/
+static void filter_block2d_bil_first_pass(
+ unsigned char *src_ptr, unsigned short *dst_ptr, unsigned int src_stride,
+ unsigned int height, unsigned int width, const short *vp8_filter) {
+ unsigned int i, j;
+
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ /* Apply bilinear filter to the current pixel and its right neighbour */
+ dst_ptr[j] =
+ (((int)src_ptr[0] * vp8_filter[0]) +
+ ((int)src_ptr[1] * vp8_filter[1]) + (VP8_FILTER_WEIGHT / 2)) >>
+ VP8_FILTER_SHIFT;
+ src_ptr++;
+ }
+
+ /* Next row: src_ptr already advanced by width within the row */
+ src_ptr += src_stride - width;
+ dst_ptr += width;
+ }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : filter_block2d_bil_second_pass
+ *
+ * INPUTS : UINT16 *src_ptr : Pointer to source block.
+ * INT32 dst_pitch : Destination block pitch.
+ * UINT32 height : Block height.
+ * UINT32 width : Block width.
+ * INT16 *vp8_filter : Array of 2 bi-linear filter taps.
+ *
+ * OUTPUTS : UINT8 *dst_ptr : Pointer to filtered block.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block
+ * in the vertical direction to produce the filtered output
+ * block. Used to implement second-pass of 2-D separable
+ * filter.
+ *
+ * SPECIAL NOTES : Requires 16-bit input as produced by
+ * filter_block2d_bil_first_pass, packed with a row length of
+ * width and holding height + 1 rows.
+ * Two filter taps should sum to VP8_FILTER_WEIGHT.
+ *
+ ****************************************************************************/
+static void filter_block2d_bil_second_pass(unsigned short *src_ptr,
+ unsigned char *dst_ptr,
+ int dst_pitch, unsigned int height,
+ unsigned int width,
+ const short *vp8_filter) {
+ unsigned int i, j;
+ int Temp;
+
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ /* Apply filter to the current sample and the one a row below it */
+ Temp = ((int)src_ptr[0] * vp8_filter[0]) +
+ ((int)src_ptr[width] * vp8_filter[1]) + (VP8_FILTER_WEIGHT / 2);
+ dst_ptr[j] = (unsigned int)(Temp >> VP8_FILTER_SHIFT);
+ src_ptr++;
+ }
+
+ /* Next row: the source is packed, so only dst_ptr needs adjusting */
+ dst_ptr += dst_pitch;
+ }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : filter_block2d_bil
+ *
+ * INPUTS : UINT8 *src_ptr : Pointer to source block.
+ * UINT32 src_pitch : Stride of source block.
+ * UINT32 dst_pitch : Stride of destination block.
+ * INT16 *HFilter : Array of 2 horizontal filter
+ * taps.
+ * INT16 *VFilter : Array of 2 vertical filter taps.
+ * INT32 Width : Block width
+ * INT32 Height : Block height
+ *
+ * OUTPUTS : UINT8 *dst_ptr : Pointer to filtered block.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : 2-D filters an input block by applying a 2-tap
+ * bi-linear filter horizontally followed by a 2-tap
+ * bi-linear filter vertically on the result.
+ *
+ * SPECIAL NOTES : The largest block size that can be handled here is 16x16:
+ * the intermediate buffer holds 17 rows of 16 samples.
+ *
+ ****************************************************************************/
+static void filter_block2d_bil(unsigned char *src_ptr, unsigned char *dst_ptr,
+ unsigned int src_pitch, unsigned int dst_pitch,
+ const short *HFilter, const short *VFilter,
+ int Width, int Height) {
+ unsigned short FData[17 * 16]; /* Temp data buffer used in filtering */
+
+ /* First filter 1-D horizontally; one extra row feeds the vertical tap */
+ filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width,
+ HFilter);
+
+ /* then 1-D vertically... */
+ filter_block2d_bil_second_pass(FData, dst_ptr, dst_pitch, Height, Width,
+ VFilter);
+}
+/* 4x4 bilinear sub-pixel prediction. xoffset and yoffset select the
+ * horizontal and vertical rows of vp8_bilinear_filters (valid indices are
+ * 0..7; not range-checked here). */
+void vp8_bilinear_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line,
+ int xoffset, int yoffset, unsigned char *dst_ptr,
+ int dst_pitch) {
+ const short *HFilter;
+ const short *VFilter;
+
+ // xoffset == yoffset == 0 represents a copy and is not required to be
+ // handled by optimizations, so callers must not pass it.
+ assert((xoffset | yoffset) != 0);
+
+ HFilter = vp8_bilinear_filters[xoffset];
+ VFilter = vp8_bilinear_filters[yoffset];
+ filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter,
+ VFilter, 4, 4);
+}
+/* 8x8 bilinear sub-pixel prediction. xoffset and yoffset select rows of
+ * vp8_bilinear_filters (0..7, not range-checked); the assert rejects the
+ * case where both are 0. */
+void vp8_bilinear_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line,
+ int xoffset, int yoffset, unsigned char *dst_ptr,
+ int dst_pitch) {
+ const short *HFilter;
+ const short *VFilter;
+
+ assert((xoffset | yoffset) != 0);
+
+ HFilter = vp8_bilinear_filters[xoffset];
+ VFilter = vp8_bilinear_filters[yoffset];
+
+ filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter,
+ VFilter, 8, 8);
+}
+/* Bilinear sub-pixel prediction for a block 8 wide and 4 high. xoffset and
+ * yoffset select rows of vp8_bilinear_filters (0..7, not range-checked);
+ * the assert rejects the case where both are 0. */
+void vp8_bilinear_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line,
+ int xoffset, int yoffset, unsigned char *dst_ptr,
+ int dst_pitch) {
+ const short *HFilter;
+ const short *VFilter;
+
+ assert((xoffset | yoffset) != 0);
+
+ HFilter = vp8_bilinear_filters[xoffset];
+ VFilter = vp8_bilinear_filters[yoffset];
+
+ filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter,
+ VFilter, 8, 4);
+}
+/* 16x16 bilinear sub-pixel prediction. xoffset and yoffset select rows of
+ * vp8_bilinear_filters (0..7, not range-checked); the assert rejects the
+ * case where both are 0. */
+void vp8_bilinear_predict16x16_c(unsigned char *src_ptr,
+ int src_pixels_per_line, int xoffset,
+ int yoffset, unsigned char *dst_ptr,
+ int dst_pitch) {
+ const short *HFilter;
+ const short *VFilter;
+
+ assert((xoffset | yoffset) != 0);
+
+ HFilter = vp8_bilinear_filters[xoffset];
+ VFilter = vp8_bilinear_filters[yoffset];
+
+ filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter,
+ VFilter, 16, 16);
+}
diff --git a/media/libvpx/libvpx/vp8/common/filter.h b/media/libvpx/libvpx/vp8/common/filter.h
new file mode 100644
index 0000000000..6acee22b21
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/filter.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP8_COMMON_FILTER_H_
+#define VPX_VP8_COMMON_FILTER_H_
+
+#include "vpx_ports/mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define BLOCK_HEIGHT_WIDTH 4
+#define VP8_FILTER_WEIGHT 128 /* equals 1 << VP8_FILTER_SHIFT */
+#define VP8_FILTER_SHIFT 7
+
+extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters[8][2]); /* 8 rows of 2 values */
+extern DECLARE_ALIGNED(16, const short, vp8_sub_pel_filters[8][6]); /* 8 rows of 6 values */
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP8_COMMON_FILTER_H_
diff --git a/media/libvpx/libvpx/vp8/common/findnearmv.c b/media/libvpx/libvpx/vp8/common/findnearmv.c
new file mode 100644
index 0000000000..3b31923621
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/findnearmv.c
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "findnearmv.h"
+
+const unsigned char vp8_mbsplit_offset[4][16] = { /* NOTE(review): presumably block offsets per split layout, one row each; confirm against users */
+ { 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 2, 8, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } /* identity: entry i is i */
+};
+
+/* Predict motion vectors using those from already-decoded nearby blocks
+ (the above, left and above-left neighbours). Note that we only consider one
+ 4x4 subblock from each candidate 16x16 macroblock. */
+void vp8_find_near_mvs(MACROBLOCKD *xd, const MODE_INFO *here, int_mv *nearest,
+ int_mv *nearby, int_mv *best_mv, int near_mv_ref_cnts[4],
+ int refframe, int *ref_frame_sign_bias) {
+ const MODE_INFO *above = here - xd->mode_info_stride;
+ const MODE_INFO *left = here - 1;
+ const MODE_INFO *aboveleft = above - 1;
+ int_mv near_mvs[4]; /* candidates, indexed by the CNT_* values below */
+ int_mv *mv = near_mvs; /* most recently filled candidate */
+ int *cntx = near_mv_ref_cnts; /* counter belonging to *mv */
+ enum { CNT_INTRA, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV };
+
+ /* Zero accumulators */
+ mv[0].as_int = mv[1].as_int = mv[2].as_int = 0; /* near_mvs[3] is not zeroed */
+ near_mv_ref_cnts[0] = near_mv_ref_cnts[1] = near_mv_ref_cnts[2] =
+ near_mv_ref_cnts[3] = 0;
+
+ /* Process above */
+ if (above->mbmi.ref_frame != INTRA_FRAME) {
+ if (above->mbmi.mv.as_int) {
+ (++mv)->as_int = above->mbmi.mv.as_int;
+ mv_bias(ref_frame_sign_bias[above->mbmi.ref_frame], refframe, mv,
+ ref_frame_sign_bias);
+ ++cntx;
+ }
+
+ *cntx += 2; /* weight 2; lands in slot 0 when the above MV is zero */
+ }
+
+ /* Process left */
+ if (left->mbmi.ref_frame != INTRA_FRAME) {
+ if (left->mbmi.mv.as_int) {
+ int_mv this_mv;
+
+ this_mv.as_int = left->mbmi.mv.as_int;
+ mv_bias(ref_frame_sign_bias[left->mbmi.ref_frame], refframe, &this_mv,
+ ref_frame_sign_bias);
+
+ if (this_mv.as_int != mv->as_int) { /* differs from the latest candidate */
+ (++mv)->as_int = this_mv.as_int;
+ ++cntx;
+ }
+
+ *cntx += 2; /* weight 2 */
+ } else {
+ near_mv_ref_cnts[CNT_INTRA] += 2; /* a zero MV is counted in slot 0 */
+ }
+ }
+
+ /* Process above left */
+ if (aboveleft->mbmi.ref_frame != INTRA_FRAME) {
+ if (aboveleft->mbmi.mv.as_int) {
+ int_mv this_mv;
+
+ this_mv.as_int = aboveleft->mbmi.mv.as_int;
+ mv_bias(ref_frame_sign_bias[aboveleft->mbmi.ref_frame], refframe,
+ &this_mv, ref_frame_sign_bias);
+
+ if (this_mv.as_int != mv->as_int) { /* differs from the latest candidate */
+ (++mv)->as_int = this_mv.as_int;
+ ++cntx;
+ }
+
+ *cntx += 1; /* weight 1 */
+ } else {
+ near_mv_ref_cnts[CNT_INTRA] += 1; /* a zero MV is counted in slot 0 */
+ }
+ }
+
+ /* If we have three distinct MV's ... */
+ if (near_mv_ref_cnts[CNT_SPLITMV]) {
+ /* See if above-left MV can be merged with NEAREST */
+ if (mv->as_int == near_mvs[CNT_NEAREST].as_int)
+ near_mv_ref_cnts[CNT_NEAREST] += 1;
+ }
+
+ near_mv_ref_cnts[CNT_SPLITMV] =
+ ((above->mbmi.mode == SPLITMV) + (left->mbmi.mode == SPLITMV)) * 2 +
+ (aboveleft->mbmi.mode == SPLITMV); /* slot 3 is overwritten with a weighted count of SPLITMV neighbours */
+
+ /* Swap near and nearest if necessary */
+ if (near_mv_ref_cnts[CNT_NEAR] > near_mv_ref_cnts[CNT_NEAREST]) {
+ int tmp;
+ tmp = near_mv_ref_cnts[CNT_NEAREST];
+ near_mv_ref_cnts[CNT_NEAREST] = near_mv_ref_cnts[CNT_NEAR];
+ near_mv_ref_cnts[CNT_NEAR] = tmp;
+ tmp = (int)near_mvs[CNT_NEAREST].as_int; /* tmp is reused for the packed MV */
+ near_mvs[CNT_NEAREST].as_int = near_mvs[CNT_NEAR].as_int;
+ near_mvs[CNT_NEAR].as_int = (uint32_t)tmp;
+ }
+
+ /* Use near_mvs[0] to store the "best" MV */
+ if (near_mv_ref_cnts[CNT_NEAREST] >= near_mv_ref_cnts[CNT_INTRA]) {
+ near_mvs[CNT_INTRA] = near_mvs[CNT_NEAREST];
+ }
+
+ /* Set up return values */
+ best_mv->as_int = near_mvs[0].as_int;
+ nearest->as_int = near_mvs[CNT_NEAREST].as_int;
+ nearby->as_int = near_mvs[CNT_NEAR].as_int;
+}
+
+static void invert_and_clamp_mvs(int_mv *inv, int_mv *src, MACROBLOCKD *xd) {
+ inv->as_mv.row = src->as_mv.row * -1; /* inv = -src, component by component */
+ inv->as_mv.col = src->as_mv.col * -1;
+ vp8_clamp_mv2(inv, xd);
+ vp8_clamp_mv2(src, xd); /* note: src is clamped in place as well */
+}
+
+int vp8_find_near_mvs_bias(MACROBLOCKD *xd, const MODE_INFO *here,
+ int_mv mode_mv_sb[2][MB_MODE_COUNT],
+ int_mv best_mv_sb[2], int cnt[4], int refframe,
+ int *ref_frame_sign_bias) {
+ int sign_bias = ref_frame_sign_bias[refframe]; /* used as the first array index below */
+
+ vp8_find_near_mvs(xd, here, &mode_mv_sb[sign_bias][NEARESTMV],
+ &mode_mv_sb[sign_bias][NEARMV], &best_mv_sb[sign_bias], cnt,
+ refframe, ref_frame_sign_bias); /* fills the [sign_bias] entries */
+
+ invert_and_clamp_mvs(&mode_mv_sb[!sign_bias][NEARESTMV],
+ &mode_mv_sb[sign_bias][NEARESTMV], xd); /* [!sign_bias] gets the negated vector; both are clamped */
+ invert_and_clamp_mvs(&mode_mv_sb[!sign_bias][NEARMV],
+ &mode_mv_sb[sign_bias][NEARMV], xd);
+ invert_and_clamp_mvs(&best_mv_sb[!sign_bias], &best_mv_sb[sign_bias], xd);
+
+ return sign_bias; /* tells the caller which index holds the non-negated set */
+}
+
+vp8_prob *vp8_mv_ref_probs(vp8_prob p[VP8_MVREFS - 1],
+ const int near_mv_ref_ct[4]) {
+ p[0] = vp8_mode_contexts[near_mv_ref_ct[0]][0]; /* p[i]: row chosen by count i, column i */
+ p[1] = vp8_mode_contexts[near_mv_ref_ct[1]][1];
+ p[2] = vp8_mode_contexts[near_mv_ref_ct[2]][2];
+ p[3] = vp8_mode_contexts[near_mv_ref_ct[3]][3];
+ /* p[3] = vp8_mode_contexts[near_mv_ref_ct[1] + near_mv_ref_ct[2] +
+ near_mv_ref_ct[3]][3]; */
+ return p; /* the same array that was passed in */
+}
diff --git a/media/libvpx/libvpx/vp8/common/findnearmv.h b/media/libvpx/libvpx/vp8/common/findnearmv.h
new file mode 100644
index 0000000000..d7db9544aa
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/findnearmv.h
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP8_COMMON_FINDNEARMV_H_
+#define VPX_VP8_COMMON_FINDNEARMV_H_
+
+#include "./vpx_config.h"
+#include "mv.h"
+#include "blockd.h"
+#include "modecont.h"
+#include "treecoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static INLINE void mv_bias(int refmb_ref_frame_sign_bias, int refframe,
+ int_mv *mvp, const int *ref_frame_sign_bias) {
+ if (refmb_ref_frame_sign_bias != ref_frame_sign_bias[refframe]) { /* the two sign biases differ */
+ mvp->as_mv.row *= -1; /* negate both components in place */
+ mvp->as_mv.col *= -1;
+ }
+}
+
+#define LEFT_TOP_MARGIN (16 << 3) /* 128 */
+#define RIGHT_BOTTOM_MARGIN (16 << 3) /* 128 */
+static INLINE void vp8_clamp_mv2(int_mv *mv, const MACROBLOCKD *xd) {
+ if (mv->as_mv.col < (xd->mb_to_left_edge - LEFT_TOP_MARGIN)) { /* col is limited to [left edge - margin, right edge + margin] */
+ mv->as_mv.col = xd->mb_to_left_edge - LEFT_TOP_MARGIN;
+ } else if (mv->as_mv.col > xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN) {
+ mv->as_mv.col = xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN;
+ }
+
+ if (mv->as_mv.row < (xd->mb_to_top_edge - LEFT_TOP_MARGIN)) { /* row is limited to [top edge - margin, bottom edge + margin] */
+ mv->as_mv.row = xd->mb_to_top_edge - LEFT_TOP_MARGIN;
+ } else if (mv->as_mv.row > xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN) {
+ mv->as_mv.row = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN;
+ }
+}
+
+static INLINE void vp8_clamp_mv(int_mv *mv, int mb_to_left_edge,
+ int mb_to_right_edge, int mb_to_top_edge,
+ int mb_to_bottom_edge) {
+ mv->as_mv.col = /* raise col to the left bound if below it */
+ (mv->as_mv.col < mb_to_left_edge) ? mb_to_left_edge : mv->as_mv.col;
+ mv->as_mv.col = /* then lower it to the right bound if above it */
+ (mv->as_mv.col > mb_to_right_edge) ? mb_to_right_edge : mv->as_mv.col;
+ mv->as_mv.row = /* same two steps for row with the top and bottom bounds */
+ (mv->as_mv.row < mb_to_top_edge) ? mb_to_top_edge : mv->as_mv.row;
+ mv->as_mv.row =
+ (mv->as_mv.row > mb_to_bottom_edge) ? mb_to_bottom_edge : mv->as_mv.row;
+}
+static INLINE unsigned int vp8_check_mv_bounds(int_mv *mv, int mb_to_left_edge,
+ int mb_to_right_edge,
+ int mb_to_top_edge,
+ int mb_to_bottom_edge) {
+ unsigned int need_to_clamp; /* OR of the four out-of-bounds comparisons */
+ need_to_clamp = (mv->as_mv.col < mb_to_left_edge);
+ need_to_clamp |= (mv->as_mv.col > mb_to_right_edge);
+ need_to_clamp |= (mv->as_mv.row < mb_to_top_edge);
+ need_to_clamp |= (mv->as_mv.row > mb_to_bottom_edge);
+ return need_to_clamp; /* 1 if any bound is exceeded, otherwise 0; mv is not modified */
+}
+
+void vp8_find_near_mvs(MACROBLOCKD *xd, const MODE_INFO *here, int_mv *nearest,
+ int_mv *nearby, int_mv *best_mv, int near_mv_ref_cnts[4],
+ int refframe, int *ref_frame_sign_bias);
+
+int vp8_find_near_mvs_bias(MACROBLOCKD *xd, const MODE_INFO *here,
+ int_mv mode_mv_sb[2][MB_MODE_COUNT],
+ int_mv best_mv_sb[2], int cnt[4], int refframe,
+ int *ref_frame_sign_bias);
+
+vp8_prob *vp8_mv_ref_probs(vp8_prob p[VP8_MVREFS - 1],
+ const int near_mv_ref_ct[4]);
+
+extern const unsigned char vp8_mbsplit_offset[4][16];
+
+static INLINE uint32_t left_block_mv(const MODE_INFO *cur_mb, int b) {
+ if (!(b & 3)) { /* b is a multiple of 4 */
+ /* On L edge, get from MB to left of us */
+ --cur_mb;
+
+ if (cur_mb->mbmi.mode != SPLITMV) return cur_mb->mbmi.mv.as_int; /* left MB has a single MV */
+ b += 4; /* so that b - 1 below is index b + 3 of the left MB */
+ }
+
+ return (cur_mb->bmi + b - 1)->mv.as_int;
+}
+
+static INLINE uint32_t above_block_mv(const MODE_INFO *cur_mb, int b,
+ int mi_stride) {
+ if (!(b >> 2)) { /* b is in 0..3 (for non-negative b) */
+ /* On top edge, get from MB above us */
+ cur_mb -= mi_stride;
+
+ if (cur_mb->mbmi.mode != SPLITMV) return cur_mb->mbmi.mv.as_int; /* above MB has a single MV */
+ b += 16; /* so that b - 4 below is index b + 12 of the above MB */
+ }
+
+ return (cur_mb->bmi + (b - 4))->mv.as_int;
+}
+static INLINE B_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb,
+ int b) {
+ if (!(b & 3)) { /* b is a multiple of 4 */
+ /* On L edge, get from MB to left of us */
+ --cur_mb;
+ switch (cur_mb->mbmi.mode) { /* map the left MB mode to a block mode */
+ case B_PRED: return (cur_mb->bmi + b + 3)->as_mode; /* per-block mode stored at index b + 3 */
+ case DC_PRED: return B_DC_PRED;
+ case V_PRED: return B_VE_PRED;
+ case H_PRED: return B_HE_PRED;
+ case TM_PRED: return B_TM_PRED;
+ default: return B_DC_PRED; /* every other mode falls back to B_DC_PRED */
+ }
+ }
+
+ return (cur_mb->bmi + b - 1)->as_mode; /* otherwise the previous block of the same MB */
+}
+
+static INLINE B_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb, int b,
+ int mi_stride) {
+ if (!(b >> 2)) { /* b is in 0..3 (for non-negative b) */
+ /* On top edge, get from MB above us */
+ cur_mb -= mi_stride;
+
+ switch (cur_mb->mbmi.mode) { /* map the above MB mode to a block mode */
+ case B_PRED: return (cur_mb->bmi + b + 12)->as_mode; /* per-block mode stored at index b + 12 */
+ case DC_PRED: return B_DC_PRED;
+ case V_PRED: return B_VE_PRED;
+ case H_PRED: return B_HE_PRED;
+ case TM_PRED: return B_TM_PRED;
+ default: return B_DC_PRED; /* every other mode falls back to B_DC_PRED */
+ }
+ }
+
+ return (cur_mb->bmi + b - 4)->as_mode; /* otherwise the block four entries earlier in the same MB */
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP8_COMMON_FINDNEARMV_H_
diff --git a/media/libvpx/libvpx/vp8/common/generic/systemdependent.c b/media/libvpx/libvpx/vp8/common/generic/systemdependent.c
new file mode 100644
index 0000000000..71529bdfd8
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/generic/systemdependent.c
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#if VPX_ARCH_ARM
+#include "vpx_ports/arm.h"
+#elif VPX_ARCH_X86 || VPX_ARCH_X86_64
+#include "vpx_ports/x86.h"
+#elif VPX_ARCH_PPC
+#include "vpx_ports/ppc.h"
+#elif VPX_ARCH_MIPS
+#include "vpx_ports/mips.h"
+#elif VPX_ARCH_LOONGARCH
+#include "vpx_ports/loongarch.h"
+#endif
+#include "vp8/common/onyxc_int.h"
+#include "vp8/common/systemdependent.h"
+
+#if CONFIG_MULTITHREAD
+#if HAVE_UNISTD_H && !defined(__OS2__)
+#include <unistd.h>
+#elif defined(_WIN32)
+#include <windows.h>
+typedef void(WINAPI *PGNSI)(LPSYSTEM_INFO);
+#elif defined(__OS2__)
+#define INCL_DOS
+#define INCL_DOSSPINLOCK
+#include <os2.h>
+#endif
+#endif
+
+#if CONFIG_MULTITHREAD
+static int get_cpu_count() {
+ int core_count = 16; /* stays 16 if no branch below assigns it */
+
+#if HAVE_UNISTD_H && !defined(__OS2__)
+#if defined(_SC_NPROCESSORS_ONLN)
+ core_count = (int)sysconf(_SC_NPROCESSORS_ONLN);
+#elif defined(_SC_NPROC_ONLN)
+ core_count = (int)sysconf(_SC_NPROC_ONLN);
+#endif
+#elif defined(_WIN32)
+ {
+#if _WIN32_WINNT >= 0x0501
+ SYSTEM_INFO sysinfo;
+ GetNativeSystemInfo(&sysinfo);
+#else
+ PGNSI pGNSI;
+ SYSTEM_INFO sysinfo;
+
+ /* Call GetNativeSystemInfo if supported or
+ * GetSystemInfo otherwise. */
+
+ pGNSI = (PGNSI)GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")),
+ "GetNativeSystemInfo");
+ if (pGNSI != NULL)
+ pGNSI(&sysinfo);
+ else
+ GetSystemInfo(&sysinfo);
+#endif
+
+ core_count = (int)sysinfo.dwNumberOfProcessors;
+ }
+#elif defined(__OS2__)
+ {
+ ULONG proc_id;
+ ULONG status;
+
+ core_count = 0;
+ for (proc_id = 1;; ++proc_id) { /* ids are probed from 1 upwards */
+ if (DosGetProcessorStatus(proc_id, &status)) break; /* stop at the first non-zero return */
+
+ if (status == PROC_ONLINE) core_count++; /* only online processors are counted */
+ }
+ }
+#else
+/* other platforms */
+#endif
+
+ return core_count > 0 ? core_count : 1; /* zero or negative results are reported as 1 */
+}
+#endif /* CONFIG_MULTITHREAD */
+
+void vp8_machine_specific_config(VP8_COMMON *ctx) {
+#if CONFIG_MULTITHREAD /* processor_core_count is only written in multithread builds */
+ ctx->processor_core_count = get_cpu_count();
+#endif /* CONFIG_MULTITHREAD */
+
+#if VPX_ARCH_ARM /* cpu_caps comes from the helper of the configured architecture */
+ ctx->cpu_caps = arm_cpu_caps();
+#elif VPX_ARCH_X86 || VPX_ARCH_X86_64
+ ctx->cpu_caps = x86_simd_caps();
+#elif VPX_ARCH_PPC
+ ctx->cpu_caps = ppc_simd_caps();
+#elif VPX_ARCH_MIPS
+ ctx->cpu_caps = mips_cpu_caps();
+#elif VPX_ARCH_LOONGARCH
+ ctx->cpu_caps = loongarch_cpu_caps();
+#else
+ // generic-gnu targets: no capability bits are set.
+ ctx->cpu_caps = 0;
+#endif
+}
diff --git a/media/libvpx/libvpx/vp8/common/header.h b/media/libvpx/libvpx/vp8/common/header.h
new file mode 100644
index 0000000000..e64e241908
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/header.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP8_COMMON_HEADER_H_
+#define VPX_VP8_COMMON_HEADER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* 24 bits total (1 + 3 + 1 + 19) when PACKET_TESTING is not defined */
+typedef struct {
+ unsigned int type : 1;
+ unsigned int version : 3;
+ unsigned int show_frame : 1;
+
+ /* 19 bits: allows up to 2^19 - 1 bytes (just under 4 megabits) for first partition */
+
+ unsigned int first_partition_length_in_bytes : 19;
+
+#ifdef PACKET_TESTING
+ unsigned int frame_number;
+ unsigned int update_gold : 1;
+ unsigned int uses_gold : 1;
+ unsigned int update_last : 1;
+ unsigned int uses_last : 1;
+#endif
+
+} VP8_HEADER;
+
+#ifdef PACKET_TESTING
+#define VP8_HEADER_SIZE 8
+#else
+#define VP8_HEADER_SIZE 3
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP8_COMMON_HEADER_H_
diff --git a/media/libvpx/libvpx/vp8/common/idct_blk.c b/media/libvpx/libvpx/vp8/common/idct_blk.c
new file mode 100644
index 0000000000..ebe1774f56
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/idct_blk.c
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#include "vpx_mem/vpx_mem.h"
+
+void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst,
+ int stride, char *eobs) {
+ int i, j;
+
+ for (i = 0; i < 4; ++i) { /* 4 rows of blocks */
+ for (j = 0; j < 4; ++j) { /* 4 blocks per row; one eobs entry is consumed per block */
+ if (*eobs++ > 1) {
+ vp8_dequant_idct_add_c(q, dq, dst, stride);
+ } else { /* eob <= 1: only q[0] * dq[0] is used */
+ vp8_dc_only_idct_add_c(q[0] * dq[0], dst, stride, dst, stride);
+ memset(q, 0, 2 * sizeof(q[0])); /* clears q[0] and q[1] */
+ }
+
+ q += 16; /* next group of 16 coefficients */
+ dst += 4; /* 4 bytes to the right */
+ }
+
+ dst += 4 * stride - 16; /* undo the 16-byte advance and move down 4 rows */
+ }
+}
+
+void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u,
+ unsigned char *dst_v, int stride,
+ char *eobs) {
+ int i, j;
+
+ for (i = 0; i < 2; ++i) { /* first pass: 2 x 2 blocks written to dst_u */
+ for (j = 0; j < 2; ++j) {
+ if (*eobs++ > 1) {
+ vp8_dequant_idct_add_c(q, dq, dst_u, stride);
+ } else { /* eob <= 1: only q[0] * dq[0] is used */
+ vp8_dc_only_idct_add_c(q[0] * dq[0], dst_u, stride, dst_u, stride);
+ memset(q, 0, 2 * sizeof(q[0])); /* clears q[0] and q[1] */
+ }
+
+ q += 16; /* next group of 16 coefficients */
+ dst_u += 4; /* 4 bytes to the right */
+ }
+
+ dst_u += 4 * stride - 8; /* undo the 8-byte advance and move down 4 rows */
+ }
+
+ for (i = 0; i < 2; ++i) { /* second pass: q and eobs continue, output goes to dst_v */
+ for (j = 0; j < 2; ++j) {
+ if (*eobs++ > 1) {
+ vp8_dequant_idct_add_c(q, dq, dst_v, stride);
+ } else { /* eob <= 1: only q[0] * dq[0] is used */
+ vp8_dc_only_idct_add_c(q[0] * dq[0], dst_v, stride, dst_v, stride);
+ memset(q, 0, 2 * sizeof(q[0])); /* clears q[0] and q[1] */
+ }
+
+ q += 16; /* next group of 16 coefficients */
+ dst_v += 4; /* 4 bytes to the right */
+ }
+
+ dst_v += 4 * stride - 8; /* undo the 8-byte advance and move down 4 rows */
+ }
+}
diff --git a/media/libvpx/libvpx/vp8/common/idctllm.c b/media/libvpx/libvpx/vp8/common/idctllm.c
new file mode 100644
index 0000000000..2f5adc0b40
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/idctllm.c
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+
+/****************************************************************************
+ * Notes:
+ *
+ * This implementation makes use of 16 bit fixed point version of two multiply
+ * constants:
+ * 1. sqrt(2) * cos (pi/8)
+ * 2. sqrt(2) * sin (pi/8)
+ * Because the first constant is bigger than 1, to maintain the same 16 bit
+ * fixed point precision as the second one, we use a trick of
+ * x * a = x + x*(a-1)
+ * so
+ * x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
+ **************************************************************************/
+static const int cospi8sqrt2minus1 = 20091; /* about (sqrt(2) * cos(pi/8) - 1) * 65536 */
+static const int sinpi8sqrt2 = 35468; /* about sqrt(2) * sin(pi/8) * 65536 */
+
+void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr,
+ int pred_stride, unsigned char *dst_ptr,
+ int dst_stride) {
+ int i;
+ int r, c;
+ int a1, b1, c1, d1;
+ short output[16]; /* intermediate and final residual, 4 values per row */
+ short *ip = input;
+ short *op = output;
+ int temp1, temp2;
+ int shortpitch = 4; /* distance between rows of output, in elements */
+
+ for (i = 0; i < 4; ++i) { /* first pass: combines elements 0, 4, 8 and 12 from each of 4 start positions */
+ a1 = ip[0] + ip[8];
+ b1 = ip[0] - ip[8];
+
+ temp1 = (ip[4] * sinpi8sqrt2) >> 16;
+ temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16); /* x + x * (a - 1) form */
+ c1 = temp1 - temp2;
+
+ temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1) >> 16); /* x + x * (a - 1) form */
+ temp2 = (ip[12] * sinpi8sqrt2) >> 16;
+ d1 = temp1 + temp2;
+
+ op[shortpitch * 0] = a1 + d1;
+ op[shortpitch * 3] = a1 - d1;
+
+ op[shortpitch * 1] = b1 + c1;
+ op[shortpitch * 2] = b1 - c1;
+
+ ip++;
+ op++;
+ }
+
+ ip = output; /* second pass runs in place on output */
+ op = output;
+
+ for (i = 0; i < 4; ++i) { /* second pass: combines 4 consecutive elements per iteration */
+ a1 = ip[0] + ip[2];
+ b1 = ip[0] - ip[2];
+
+ temp1 = (ip[1] * sinpi8sqrt2) >> 16;
+ temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1) >> 16);
+ c1 = temp1 - temp2;
+
+ temp1 = ip[1] + ((ip[1] * cospi8sqrt2minus1) >> 16);
+ temp2 = (ip[3] * sinpi8sqrt2) >> 16;
+ d1 = temp1 + temp2;
+
+ op[0] = (a1 + d1 + 4) >> 3; /* add 4 then shift right by 3 */
+ op[3] = (a1 - d1 + 4) >> 3;
+
+ op[1] = (b1 + c1 + 4) >> 3;
+ op[2] = (b1 - c1 + 4) >> 3;
+
+ ip += shortpitch;
+ op += shortpitch;
+ }
+
+ ip = output;
+ for (r = 0; r < 4; ++r) { /* add the result to the prediction, 4 rows of 4 */
+ for (c = 0; c < 4; ++c) {
+ int a = ip[c] + pred_ptr[c];
+
+ if (a < 0) a = 0; /* clamp to 0..255 before the byte store */
+
+ if (a > 255) a = 255;
+
+ dst_ptr[c] = (unsigned char)a;
+ }
+ ip += 4;
+ dst_ptr += dst_stride;
+ pred_ptr += pred_stride;
+ }
+}
+
+void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
+ int pred_stride, unsigned char *dst_ptr,
+ int dst_stride) {
+ int a1 = ((input_dc + 4) >> 3); /* one value, added to all 16 prediction bytes */
+ int r, c;
+
+ for (r = 0; r < 4; ++r) {
+ for (c = 0; c < 4; ++c) {
+ int a = a1 + pred_ptr[c];
+
+ if (a < 0) a = 0; /* clamp to 0..255 before the byte store */
+
+ if (a > 255) a = 255;
+
+ dst_ptr[c] = (unsigned char)a;
+ }
+
+ dst_ptr += dst_stride;
+ pred_ptr += pred_stride;
+ }
+}
+
+void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff) {
+ short output[16]; /* holds both the intermediate and the final 16 values */
+ int i;
+ int a1, b1, c1, d1;
+ int a2, b2, c2, d2;
+ short *ip = input;
+ short *op = output;
+
+ for (i = 0; i < 4; ++i) { /* first pass: combines elements 0, 4, 8 and 12 from each of 4 start positions */
+ a1 = ip[0] + ip[12];
+ b1 = ip[4] + ip[8];
+ c1 = ip[4] - ip[8];
+ d1 = ip[0] - ip[12];
+
+ op[0] = a1 + b1;
+ op[4] = c1 + d1;
+ op[8] = a1 - b1;
+ op[12] = d1 - c1;
+ ip++;
+ op++;
+ }
+
+ ip = output; /* second pass runs in place on output */
+ op = output;
+
+ for (i = 0; i < 4; ++i) { /* second pass: combines 4 consecutive elements per iteration */
+ a1 = ip[0] + ip[3];
+ b1 = ip[1] + ip[2];
+ c1 = ip[1] - ip[2];
+ d1 = ip[0] - ip[3];
+
+ a2 = a1 + b1;
+ b2 = c1 + d1;
+ c2 = a1 - b1;
+ d2 = d1 - c1;
+
+ op[0] = (a2 + 3) >> 3; /* add 3 then shift right by 3 */
+ op[1] = (b2 + 3) >> 3;
+ op[2] = (c2 + 3) >> 3;
+ op[3] = (d2 + 3) >> 3;
+
+ ip += 4;
+ op += 4;
+ }
+
+ for (i = 0; i < 16; ++i) {
+ mb_dqcoeff[i * 16] = output[i]; /* result i is stored at offset 16 * i */
+ }
+}
+
+void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff) {
+ int i;
+ int a1;
+
+ a1 = ((input[0] + 3) >> 3); /* only input[0] is read */
+ for (i = 0; i < 16; ++i) {
+ mb_dqcoeff[i * 16] = a1; /* the same value is stored at offsets 0, 16, ..., 240 */
+ }
+}
diff --git a/media/libvpx/libvpx/vp8/common/invtrans.h b/media/libvpx/libvpx/vp8/common/invtrans.h
new file mode 100644
index 0000000000..aed7bb0600
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/invtrans.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP8_COMMON_INVTRANS_H_
+#define VPX_VP8_COMMON_INVTRANS_H_
+
+#include "./vpx_config.h"
+#include "vp8_rtcd.h"
+#include "blockd.h"
+#include "onyxc_int.h"
+
+#if CONFIG_MULTITHREAD
+#include "vpx_mem/vpx_mem.h"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static void eob_adjust(char *eobs, short *diff) {
+ /* eob adjust.... the idct can only skip if both the dc and eob are zero */
+ int js;
+ for (js = 0; js < 16; ++js) {
+ if ((eobs[js] == 0) && (diff[0] != 0)) eobs[js]++; /* zero eob with a non-zero first coefficient becomes 1 */
+ diff += 16; /* first coefficient of the next block */
+ }
+}
+
+static INLINE void vp8_inverse_transform_mby(MACROBLOCKD *xd) {
+ short *DQC = xd->dequant_y1; /* used as is for SPLITMV; replaced below otherwise */
+
+ if (xd->mode_info_context->mbmi.mode != SPLITMV) {
+ /* do 2nd order transform on the dc block */
+ if (xd->eobs[24] > 1) {
+ vp8_short_inv_walsh4x4(&xd->block[24].dqcoeff[0], xd->qcoeff);
+ } else { /* eob of block 24 is 0 or 1: single-coefficient variant */
+ vp8_short_inv_walsh4x4_1(&xd->block[24].dqcoeff[0], xd->qcoeff);
+ }
+ eob_adjust(xd->eobs, xd->qcoeff); /* runs after the walsh call has written into xd->qcoeff */
+
+ DQC = xd->dequant_y1_dc;
+ }
+ vp8_dequant_idct_add_y_block(xd->qcoeff, DQC, xd->dst.y_buffer,
+ xd->dst.y_stride, xd->eobs);
+}
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP8_COMMON_INVTRANS_H_
diff --git a/media/libvpx/libvpx/vp8/common/loongarch/idct_lsx.c b/media/libvpx/libvpx/vp8/common/loongarch/idct_lsx.c
new file mode 100644
index 0000000000..eee871eec4
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/loongarch/idct_lsx.c
@@ -0,0 +1,322 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vp8/common/blockd.h"
+#include "vpx_util/loongson_intrinsics.h"
+
+static const int32_t cospi8sqrt2minus1 = 20091; /* about (sqrt(2) * cos(pi/8) - 1) * 65536 */
+static const int32_t sinpi8sqrt2 = 35468; /* about sqrt(2) * sin(pi/8) * 65536 */
+
+#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
+ do { /* NOTE(review): by its name a transpose of 16-bit lanes; confirm lane order */ \
+ __m128i tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ \
+ DUP2_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, tmp0_m, tmp1_m); \
+ DUP2_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, tmp2_m, tmp3_m); \
+ DUP2_ARG2(__lsx_vilvl_w, tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2); \
+ DUP2_ARG2(__lsx_vilvh_w, tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3); \
+ } while (0)
+
+#define TRANSPOSE_TWO_4x4_H(in0, in1, in2, in3, out0, out1, out2, out3) \
+ do { /* NOTE(review): by its name transposes two 4x4 halfword blocks at once; confirm */ \
+ __m128i s4_m, s5_m, s6_m, s7_m; \
+ \
+ TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, s4_m, s5_m, s6_m, s7_m); \
+ DUP2_ARG2(__lsx_vilvl_d, s6_m, s4_m, s7_m, s5_m, out0, out2); \
+ out1 = __lsx_vilvh_d(s6_m, s4_m); \
+ out3 = __lsx_vilvh_d(s7_m, s5_m); \
+ } while (0)
+
+#define EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in0, in1) \
+ do { /* reads in0, multiplies by sinpi8sqrt2 in word lanes, assigns the packed result to in1 */ \
+ __m128i zero_m = __lsx_vldi(0); \
+ __m128i tmp1_m, tmp2_m; \
+ __m128i sinpi8_sqrt2_m = __lsx_vreplgr2vr_w(sinpi8sqrt2); \
+ \
+ tmp1_m = __lsx_vilvl_h(in0, zero_m); \
+ tmp2_m = __lsx_vilvh_h(in0, zero_m); \
+ tmp1_m = __lsx_vsrai_w(tmp1_m, 16); /* NOTE(review): presumably sign-extends each halfword to a word; confirm */ \
+ tmp2_m = __lsx_vsrai_w(tmp2_m, 16); \
+ tmp1_m = __lsx_vmul_w(tmp1_m, sinpi8_sqrt2_m); \
+ tmp1_m = __lsx_vsrai_w(tmp1_m, 16); \
+ tmp2_m = __lsx_vmul_w(tmp2_m, sinpi8_sqrt2_m); \
+ tmp2_m = __lsx_vsrai_w(tmp2_m, 16); \
+ in1 = __lsx_vpickev_h(tmp2_m, tmp1_m); \
+ } while (0)
+
+#define VP8_IDCT_1D_H(in0, in1, in2, in3, out0, out1, out2, out3) \
+ do { /* NOTE(review): halfword-lane 1-D pass; a1/b1 from in0 and in2, c1/d1 from in1 and in3; confirm against the C version */ \
+ __m128i a1_m, b1_m, c1_m, d1_m; \
+ __m128i c_tmp1_m, c_tmp2_m; \
+ __m128i d_tmp1_m, d_tmp2_m; \
+ __m128i const_cospi8sqrt2minus1_m; \
+ \
+ const_cospi8sqrt2minus1_m = __lsx_vreplgr2vr_h(cospi8sqrt2minus1); \
+ a1_m = __lsx_vadd_h(in0, in2); \
+ b1_m = __lsx_vsub_h(in0, in2); \
+ EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in1, c_tmp1_m); \
+ \
+ c_tmp2_m = __lsx_vmuh_h(in3, const_cospi8sqrt2minus1_m); \
+ c_tmp2_m = __lsx_vslli_h(c_tmp2_m, 1); \
+ c_tmp2_m = __lsx_vsrai_h(c_tmp2_m, 1); \
+ c_tmp2_m = __lsx_vadd_h(in3, c_tmp2_m); \
+ c1_m = __lsx_vsub_h(c_tmp1_m, c_tmp2_m); \
+ \
+ d_tmp1_m = __lsx_vmuh_h(in1, const_cospi8sqrt2minus1_m); \
+ d_tmp1_m = __lsx_vslli_h(d_tmp1_m, 1); \
+ d_tmp1_m = __lsx_vsrai_h(d_tmp1_m, 1); \
+ d_tmp1_m = __lsx_vadd_h(in1, d_tmp1_m); \
+ EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in3, d_tmp2_m); \
+ d1_m = __lsx_vadd_h(d_tmp1_m, d_tmp2_m); \
+ LSX_BUTTERFLY_4_H(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \
+ } while (0)
+
+#define VP8_IDCT_1D_W(in0, in1, in2, in3, out0, out1, out2, out3) \
+ do { /* NOTE(review): word-lane 1-D pass; each product with a constant is followed by a shift of 16; confirm against the C version */ \
+ __m128i a1_m, b1_m, c1_m, d1_m; \
+ __m128i c_tmp1_m, c_tmp2_m, d_tmp1_m, d_tmp2_m; \
+ __m128i const_cospi8sqrt2minus1_m, sinpi8_sqrt2_m; \
+ \
+ const_cospi8sqrt2minus1_m = __lsx_vreplgr2vr_w(cospi8sqrt2minus1); \
+ sinpi8_sqrt2_m = __lsx_vreplgr2vr_w(sinpi8sqrt2); \
+ a1_m = __lsx_vadd_w(in0, in2); \
+ b1_m = __lsx_vsub_w(in0, in2); \
+ c_tmp1_m = __lsx_vmul_w(in1, sinpi8_sqrt2_m); \
+ c_tmp1_m = __lsx_vsrai_w(c_tmp1_m, 16); \
+ c_tmp2_m = __lsx_vmul_w(in3, const_cospi8sqrt2minus1_m); \
+ c_tmp2_m = __lsx_vsrai_w(c_tmp2_m, 16); \
+ c_tmp2_m = __lsx_vadd_w(in3, c_tmp2_m); \
+ c1_m = __lsx_vsub_w(c_tmp1_m, c_tmp2_m); \
+ d_tmp1_m = __lsx_vmul_w(in1, const_cospi8sqrt2minus1_m); \
+ d_tmp1_m = __lsx_vsrai_w(d_tmp1_m, 16); \
+ d_tmp1_m = __lsx_vadd_w(in1, d_tmp1_m); \
+ d_tmp2_m = __lsx_vmul_w(in3, sinpi8_sqrt2_m); \
+ d_tmp2_m = __lsx_vsrai_w(d_tmp2_m, 16); \
+ d1_m = __lsx_vadd_w(d_tmp1_m, d_tmp2_m); \
+ LSX_BUTTERFLY_4_W(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \
+ } while (0)
+
+#define UNPCK_SH_SW(in, out0, out1) \
+ do { /* NOTE(review): presumably widens the two halves of in to 32-bit lanes; confirm which half goes to out0 */ \
+ out0 = __lsx_vsllwil_w_h(in, 0); \
+ out1 = __lsx_vexth_w_h(in); \
+ } while (0)
+
+static void idct4x4_addconst_lsx(int16_t in_dc, uint8_t *pred,
+ int32_t pred_stride, uint8_t *dest,
+ int32_t dest_stride) {
+ __m128i vec, res0, res1, res2, res3, dst0, dst1;
+ __m128i pred0, pred1, pred2, pred3;
+ __m128i zero = __lsx_vldi(0);
+
+ int32_t pred_stride2 = pred_stride << 1; /* offsets of rows 2 and 3 of pred */
+ int32_t pred_stride3 = pred_stride2 + pred_stride;
+
+ vec = __lsx_vreplgr2vr_h(in_dc);
+ vec = __lsx_vsrari_h(vec, 3); /* NOTE(review): presumably the rounded shift by 3 that the C version writes as (dc + 4) >> 3; confirm */
+ pred0 = __lsx_vld(pred, 0); /* one vector load per prediction row, rows 0..3 */
+ DUP2_ARG2(__lsx_vldx, pred, pred_stride, pred, pred_stride2, pred1, pred2);
+ pred3 = __lsx_vldx(pred, pred_stride3);
+ DUP4_ARG2(__lsx_vilvl_b, zero, pred0, zero, pred1, zero, pred2, zero, pred3,
+ res0, res1, res2, res3);
+ DUP4_ARG2(__lsx_vadd_h, res0, vec, res1, vec, res2, vec, res3, vec, res0,
+ res1, res2, res3);
+ res0 = __lsx_vclip255_h(res0);
+ res1 = __lsx_vclip255_h(res1);
+ res2 = __lsx_vclip255_h(res2);
+ res3 = __lsx_vclip255_h(res3);
+
+ DUP2_ARG2(__lsx_vpickev_b, res1, res0, res3, res2, dst0, dst1);
+ dst0 = __lsx_vpickev_w(dst1, dst0);
+ __lsx_vstelm_w(dst0, dest, 0, 0); /* one word element of dst0 per destination row */
+ dest += dest_stride;
+ __lsx_vstelm_w(dst0, dest, 0, 1);
+ dest += dest_stride;
+ __lsx_vstelm_w(dst0, dest, 0, 2);
+ dest += dest_stride;
+ __lsx_vstelm_w(dst0, dest, 0, 3);
+}
+
+void vp8_dc_only_idct_add_lsx(int16_t input_dc, uint8_t *pred_ptr,
+ int32_t pred_stride, uint8_t *dst_ptr,
+ int32_t dst_stride) {
+ idct4x4_addconst_lsx(input_dc, pred_ptr, pred_stride, dst_ptr, dst_stride); /* pure forwarding wrapper */
+}
+
+static void dequant_idct4x4_addblk_2x_lsx(int16_t *input,
+ int16_t *dequant_input, uint8_t *dest,
+ int32_t dest_stride) {
+ __m128i dest0, dest1, dest2, dest3;
+ __m128i in0, in1, in2, in3, mul0, mul1, mul2, mul3, dequant_in0, dequant_in1;
+ __m128i hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3, res0, res1, res2, res3;
+ __m128i hz0l, hz1l, hz2l, hz3l, hz0r, hz1r, hz2r, hz3r;
+ __m128i vt0l, vt1l, vt2l, vt3l, vt0r, vt1r, vt2r, vt3r;
+ __m128i zero = __lsx_vldi(0);
+
+ int32_t dest_stride2 = dest_stride << 1; /* offsets of rows 2 and 3 of dest */
+ int32_t dest_stride3 = dest_stride2 + dest_stride;
+
+ DUP4_ARG2(__lsx_vld, input, 0, input, 16, input, 32, input, 48, in0, in1, in2,
+ in3); /* four vector loads from input at offsets 0, 16, 32 and 48 */
+ DUP2_ARG2(__lsx_vld, dequant_input, 0, dequant_input, 16, dequant_in0,
+ dequant_in1);
+
+ DUP4_ARG2(__lsx_vmul_h, in0, dequant_in0, in1, dequant_in1, in2, dequant_in0,
+ in3, dequant_in1, mul0, mul1, mul2, mul3); /* the same two dequant vectors are applied to both halves of the input */
+ DUP2_ARG2(__lsx_vpickev_d, mul2, mul0, mul3, mul1, in0, in2);
+ DUP2_ARG2(__lsx_vpickod_d, mul2, mul0, mul3, mul1, in1, in3);
+
+ VP8_IDCT_1D_H(in0, in1, in2, in3, hz0, hz1, hz2, hz3); /* first 1-D pass in halfword lanes */
+ TRANSPOSE_TWO_4x4_H(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3);
+ UNPCK_SH_SW(hz0, hz0r, hz0l);
+ UNPCK_SH_SW(hz1, hz1r, hz1l);
+ UNPCK_SH_SW(hz2, hz2r, hz2l);
+ UNPCK_SH_SW(hz3, hz3r, hz3l);
+ VP8_IDCT_1D_W(hz0l, hz1l, hz2l, hz3l, vt0l, vt1l, vt2l, vt3l); /* second 1-D pass in word lanes, l and r halves separately */
+ DUP4_ARG2(__lsx_vsrari_w, vt0l, 3, vt1l, 3, vt2l, 3, vt3l, 3, vt0l, vt1l,
+ vt2l, vt3l);
+ VP8_IDCT_1D_W(hz0r, hz1r, hz2r, hz3r, vt0r, vt1r, vt2r, vt3r);
+ DUP4_ARG2(__lsx_vsrari_w, vt0r, 3, vt1r, 3, vt2r, 3, vt3r, 3, vt0r, vt1r,
+ vt2r, vt3r);
+ DUP4_ARG2(__lsx_vpickev_h, vt0l, vt0r, vt1l, vt1r, vt2l, vt2r, vt3l, vt3r,
+ vt0, vt1, vt2, vt3);
+ TRANSPOSE_TWO_4x4_H(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3);
+ dest0 = __lsx_vld(dest, 0); /* dest is read here and written below */
+ DUP2_ARG2(__lsx_vldx, dest, dest_stride, dest, dest_stride2, dest1, dest2);
+ dest3 = __lsx_vldx(dest, dest_stride3);
+ DUP4_ARG2(__lsx_vilvl_b, zero, dest0, zero, dest1, zero, dest2, zero, dest3,
+ res0, res1, res2, res3);
+ DUP4_ARG2(__lsx_vadd_h, res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0,
+ res1, res2, res3);
+
+ res0 = __lsx_vclip255_h(res0);
+ res1 = __lsx_vclip255_h(res1);
+ res2 = __lsx_vclip255_h(res2);
+ res3 = __lsx_vclip255_h(res3);
+ DUP2_ARG2(__lsx_vpickev_b, res1, res0, res3, res2, vt0l, vt1l); /* vt0l and vt1l are reused as output temporaries */
+
+ __lsx_vstelm_d(vt0l, dest, 0, 0); /* one doubleword element per destination row */
+ __lsx_vstelm_d(vt0l, dest + dest_stride, 0, 1);
+ __lsx_vstelm_d(vt1l, dest + dest_stride2, 0, 0);
+ __lsx_vstelm_d(vt1l, dest + dest_stride3, 0, 1);
+
+ __lsx_vst(zero, input, 0); /* NOTE(review): presumably clears all 32 input coefficients (4 stores of one vector); confirm */
+ __lsx_vst(zero, input, 16);
+ __lsx_vst(zero, input, 32);
+ __lsx_vst(zero, input, 48);
+}
+
+static void dequant_idct_addconst_2x_lsx(int16_t *input, int16_t *dequant_input,
+ uint8_t *dest, int32_t dest_stride) {
+ __m128i input_dc0, input_dc1, vec, res0, res1, res2, res3;
+ __m128i dest0, dest1, dest2, dest3;
+ __m128i zero = __lsx_vldi(0);
+ int32_t dest_stride2 = dest_stride << 1; /* offsets of rows 2 and 3 of dest */
+ int32_t dest_stride3 = dest_stride2 + dest_stride;
+
+ input_dc0 = __lsx_vreplgr2vr_h(input[0] * dequant_input[0]); /* first coefficient of the first block, dequantised */
+ input_dc1 = __lsx_vreplgr2vr_h(input[16] * dequant_input[0]); /* first coefficient of the block 16 entries later */
+ DUP2_ARG2(__lsx_vsrari_h, input_dc0, 3, input_dc1, 3, input_dc0, input_dc1);
+ vec = __lsx_vpickev_d(input_dc1, input_dc0);
+ input[0] = 0; /* only these two coefficients are cleared here */
+ input[16] = 0;
+ dest0 = __lsx_vld(dest, 0); /* dest is read here and written below */
+ DUP2_ARG2(__lsx_vldx, dest, dest_stride, dest, dest_stride2, dest1, dest2);
+ dest3 = __lsx_vldx(dest, dest_stride3);
+ DUP4_ARG2(__lsx_vilvl_b, zero, dest0, zero, dest1, zero, dest2, zero, dest3,
+ res0, res1, res2, res3);
+ DUP4_ARG2(__lsx_vadd_h, res0, vec, res1, vec, res2, vec, res3, vec, res0,
+ res1, res2, res3);
+ res0 = __lsx_vclip255_h(res0);
+ res1 = __lsx_vclip255_h(res1);
+ res2 = __lsx_vclip255_h(res2);
+ res3 = __lsx_vclip255_h(res3);
+
+ DUP2_ARG2(__lsx_vpickev_b, res1, res0, res3, res2, res0, res1);
+ __lsx_vstelm_d(res0, dest, 0, 0); /* one doubleword element per destination row */
+ __lsx_vstelm_d(res0, dest + dest_stride, 0, 1);
+ __lsx_vstelm_d(res1, dest + dest_stride2, 0, 0);
+ __lsx_vstelm_d(res1, dest + dest_stride3, 0, 1);
+}
+
+void vp8_dequant_idct_add_y_block_lsx(int16_t *q, int16_t *dq, uint8_t *dst,
+ int32_t stride, char *eobs) {
+ int16_t *eobs_h = (int16_t *)eobs; /* each 16-bit read covers two eobs bytes; NOTE(review): relies on eobs being suitably aligned, confirm */
+ uint8_t i;
+
+ for (i = 4; i--;) { /* four iterations, each handling two pairs of blocks */
+ if (eobs_h[0]) { /* skipped entirely when both eobs of the pair are zero */
+ if (eobs_h[0] & 0xfefe) { /* a bit other than bit 0 is set in either byte */
+ dequant_idct4x4_addblk_2x_lsx(q, dq, dst, stride);
+ } else { /* both eobs are 0 or 1 */
+ dequant_idct_addconst_2x_lsx(q, dq, dst, stride);
+ }
+ }
+
+ q += 32; /* two blocks of 16 coefficients */
+
+ if (eobs_h[1]) { /* second pair, written 8 bytes further right */
+ if (eobs_h[1] & 0xfefe) {
+ dequant_idct4x4_addblk_2x_lsx(q, dq, dst + 8, stride);
+ } else {
+ dequant_idct_addconst_2x_lsx(q, dq, dst + 8, stride);
+ }
+ }
+
+ q += 32;
+ dst += (4 * stride); /* down 4 rows */
+ eobs_h += 2; /* four eobs bytes consumed per iteration */
+ }
+}
+
+void vp8_dequant_idct_add_uv_block_lsx(int16_t *q, int16_t *dq, uint8_t *dst_u,
+ uint8_t *dst_v, int32_t stride,
+ char *eobs) {
+ int16_t *eobs_h = (int16_t *)eobs; /* each 16-bit read covers two eobs bytes; NOTE(review): relies on eobs being suitably aligned, confirm */
+ if (eobs_h[0]) { /* first pair of blocks, written at dst_u */
+ if (eobs_h[0] & 0xfefe) { /* a bit other than bit 0 is set in either byte */
+ dequant_idct4x4_addblk_2x_lsx(q, dq, dst_u, stride);
+ } else { /* both eobs are 0 or 1 */
+ dequant_idct_addconst_2x_lsx(q, dq, dst_u, stride);
+ }
+ }
+
+ q += 32; /* two blocks of 16 coefficients */
+ dst_u += (stride * 4); /* down 4 rows */
+
+ if (eobs_h[1]) { /* second pair, 4 rows further down in dst_u */
+ if (eobs_h[1] & 0xfefe) {
+ dequant_idct4x4_addblk_2x_lsx(q, dq, dst_u, stride);
+ } else {
+ dequant_idct_addconst_2x_lsx(q, dq, dst_u, stride);
+ }
+ }
+
+ q += 32;
+
+ if (eobs_h[2]) { /* third pair, written at dst_v */
+ if (eobs_h[2] & 0xfefe) {
+ dequant_idct4x4_addblk_2x_lsx(q, dq, dst_v, stride);
+ } else {
+ dequant_idct_addconst_2x_lsx(q, dq, dst_v, stride);
+ }
+ }
+ q += 32;
+ dst_v += (stride * 4); /* down 4 rows */
+
+ if (eobs_h[3]) { /* fourth pair, 4 rows further down in dst_v */
+ if (eobs_h[3] & 0xfefe) {
+ dequant_idct4x4_addblk_2x_lsx(q, dq, dst_v, stride);
+ } else {
+ dequant_idct_addconst_2x_lsx(q, dq, dst_v, stride);
+ }
+ }
+}
diff --git a/media/libvpx/libvpx/vp8/common/loongarch/loopfilter_filters_lsx.c b/media/libvpx/libvpx/vp8/common/loongarch/loopfilter_filters_lsx.c
new file mode 100644
index 0000000000..79c3ea6dbb
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/loongarch/loopfilter_filters_lsx.c
@@ -0,0 +1,743 @@
+/*
+ * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vp8/common/loopfilter.h"
+#include "vpx_util/loongson_intrinsics.h"
+
+#define VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev) \
+ do { /* rewrites p1, p0, q0, q1 in place and leaves hev bitwise inverted */ \
+ __m128i p1_m, p0_m, q0_m, q1_m, filt, q0_sub_p0, t1, t2; \
+ const __m128i cnst4b = __lsx_vldi(4); \
+ const __m128i cnst3b = __lsx_vldi(3); \
+ \
+ p1_m = __lsx_vxori_b(p1, 0x80); /* flip the top bit of every byte */ \
+ p0_m = __lsx_vxori_b(p0, 0x80); \
+ q0_m = __lsx_vxori_b(q0, 0x80); \
+ q1_m = __lsx_vxori_b(q1, 0x80); \
+ \
+ filt = __lsx_vssub_b(p1_m, q1_m); \
+ filt = __lsx_vand_v(filt, hev); /* keep p1 - q1 only where hev is set */ \
+ q0_sub_p0 = __lsx_vssub_b(q0_m, p0_m); \
+ filt = __lsx_vsadd_b(filt, q0_sub_p0); /* three saturating adds give */ \
+ filt = __lsx_vsadd_b(filt, q0_sub_p0); /* filt + 3 * (q0 - p0) */ \
+ filt = __lsx_vsadd_b(filt, q0_sub_p0); \
+ filt = __lsx_vand_v(filt, mask); /* zero the lanes excluded by mask */ \
+ t1 = __lsx_vsadd_b(filt, cnst4b); \
+ t1 = __lsx_vsra_b(t1, cnst3b); /* t1 = (filt + 4) >> 3 */ \
+ t2 = __lsx_vsadd_b(filt, cnst3b); /* cnst3b doubles as the addend 3 */ \
+ t2 = __lsx_vsra_b(t2, cnst3b); /* t2 = (filt + 3) >> 3 */ \
+ q0_m = __lsx_vssub_b(q0_m, t1); \
+ q0 = __lsx_vxori_b(q0_m, 0x80); /* undo the top-bit flip on output */ \
+ p0_m = __lsx_vsadd_b(p0_m, t2); \
+ p0 = __lsx_vxori_b(p0_m, 0x80); \
+ filt = __lsx_vsrari_b(t1, 1); /* rounding shift right of t1 by one */ \
+ hev = __lsx_vxori_b(hev, 0xff); /* invert hev; the caller's copy changes */ \
+ filt = __lsx_vand_v(filt, hev); /* outer taps move only where hev was 0 */ \
+ q1_m = __lsx_vssub_b(q1_m, filt); \
+ q1 = __lsx_vxori_b(q1_m, 0x80); \
+ p1_m = __lsx_vsadd_b(p1_m, filt); \
+ p1 = __lsx_vxori_b(p1_m, 0x80); \
+ } while (0)
+
+#define VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev) \
+ do { /* rewrites p2..q2 in place and leaves hev bitwise inverted */ \
+ __m128i p2_m, p1_m, p0_m, q2_m, q1_m, q0_m; \
+ __m128i u, filt, t1, t2, filt_sign, q0_sub_p0; \
+ __m128i filt_r, filt_l; \
+ __m128i temp0, temp1, temp2, temp3; \
+ const __m128i cnst4b = __lsx_vldi(4); \
+ const __m128i cnst3b = __lsx_vldi(3); \
+ const __m128i cnst9h = __lsx_vldi(1033); /* 1033 = (1 << 10) | 9 */ \
+ const __m128i cnst63h = __lsx_vldi(1087); /* 1087 = (1 << 10) | 63 */ \
+ \
+ p2_m = __lsx_vxori_b(p2, 0x80); /* flip the top bit of every byte */ \
+ p1_m = __lsx_vxori_b(p1, 0x80); \
+ p0_m = __lsx_vxori_b(p0, 0x80); \
+ q0_m = __lsx_vxori_b(q0, 0x80); \
+ q1_m = __lsx_vxori_b(q1, 0x80); \
+ q2_m = __lsx_vxori_b(q2, 0x80); \
+ \
+ filt = __lsx_vssub_b(p1_m, q1_m); \
+ q0_sub_p0 = __lsx_vssub_b(q0_m, p0_m); \
+ filt = __lsx_vsadd_b(filt, q0_sub_p0); /* three saturating adds give */ \
+ filt = __lsx_vsadd_b(filt, q0_sub_p0); /* (p1 - q1) + 3 * (q0 - p0) */ \
+ filt = __lsx_vsadd_b(filt, q0_sub_p0); \
+ filt = __lsx_vand_v(filt, mask); /* zero the lanes excluded by mask */ \
+ \
+ t2 = __lsx_vand_v(filt, hev); /* part of filt where hev is set */ \
+ hev = __lsx_vxori_b(hev, 0xff); /* invert hev; the caller's copy changes */ \
+ filt = __lsx_vand_v(hev, filt); /* part of filt where hev was clear */ \
+ t1 = __lsx_vsadd_b(t2, cnst4b); \
+ t1 = __lsx_vsra_b(t1, cnst3b); /* t1 = (t2 + 4) >> 3 */ \
+ t2 = __lsx_vsadd_b(t2, cnst3b); /* cnst3b doubles as the addend 3 */ \
+ t2 = __lsx_vsra_b(t2, cnst3b); /* t2 = (t2 + 3) >> 3 */ \
+ q0_m = __lsx_vssub_b(q0_m, t1); \
+ p0_m = __lsx_vsadd_b(p0_m, t2); \
+ filt_sign = __lsx_vslti_b(filt, 0); /* all-ones in negative lanes */ \
+ filt_r = __lsx_vilvl_b(filt_sign, filt); /* sign-extend low 8 bytes */ \
+ filt_l = __lsx_vilvh_b(filt_sign, filt); /* sign-extend high 8 bytes */ \
+ temp0 = __lsx_vmul_h(filt_r, cnst9h); /* 9 * filt */ \
+ temp1 = __lsx_vadd_h(temp0, cnst63h); /* 9 * filt + 63 */ \
+ temp2 = __lsx_vmul_h(filt_l, cnst9h); \
+ temp3 = __lsx_vadd_h(temp2, cnst63h); \
+ \
+ u = __lsx_vssrani_b_h(temp3, temp1, 7); /* >> 7, saturate to bytes */ \
+ q2_m = __lsx_vssub_b(q2_m, u); \
+ p2_m = __lsx_vsadd_b(p2_m, u); \
+ q2 = __lsx_vxori_b(q2_m, 0x80); /* undo the top-bit flip on output */ \
+ p2 = __lsx_vxori_b(p2_m, 0x80); \
+ \
+ temp1 = __lsx_vadd_h(temp1, temp0); /* 18 * filt + 63 */ \
+ temp3 = __lsx_vadd_h(temp3, temp2); \
+ \
+ u = __lsx_vssrani_b_h(temp3, temp1, 7); \
+ q1_m = __lsx_vssub_b(q1_m, u); \
+ p1_m = __lsx_vsadd_b(p1_m, u); \
+ q1 = __lsx_vxori_b(q1_m, 0x80); \
+ p1 = __lsx_vxori_b(p1_m, 0x80); \
+ \
+ temp1 = __lsx_vadd_h(temp1, temp0); /* 27 * filt + 63 */ \
+ temp3 = __lsx_vadd_h(temp3, temp2); \
+ \
+ u = __lsx_vssrani_b_h(temp3, temp1, 7); \
+ q0_m = __lsx_vssub_b(q0_m, u); \
+ p0_m = __lsx_vsadd_b(p0_m, u); \
+ q0 = __lsx_vxori_b(q0_m, 0x80); \
+ p0 = __lsx_vxori_b(p0_m, 0x80); \
+ } while (0)
+
+#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \
+ limit_in, b_limit_in, thresh_in, hev_out, mask_out, \
+ flat_out) \
+ do { /* writes hev_out, mask_out and flat_out; the inputs are only read */ \
+ __m128i p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \
+ __m128i p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \
+ \
+ p3_asub_p2_m = __lsx_vabsd_bu(p3_in, p2_in); /* unsigned |a - b| */ \
+ p2_asub_p1_m = __lsx_vabsd_bu(p2_in, p1_in); \
+ p1_asub_p0_m = __lsx_vabsd_bu(p1_in, p0_in); \
+ q1_asub_q0_m = __lsx_vabsd_bu(q1_in, q0_in); \
+ q2_asub_q1_m = __lsx_vabsd_bu(q2_in, q1_in); \
+ q3_asub_q2_m = __lsx_vabsd_bu(q3_in, q2_in); \
+ p0_asub_q0_m = __lsx_vabsd_bu(p0_in, q0_in); \
+ p1_asub_q1_m = __lsx_vabsd_bu(p1_in, q1_in); \
+ flat_out = __lsx_vmax_bu(p1_asub_p0_m, q1_asub_q0_m); \
+ hev_out = __lsx_vslt_bu(thresh_in, flat_out); /* set where flat > thresh */ \
+ p0_asub_q0_m = __lsx_vsadd_bu(p0_asub_q0_m, p0_asub_q0_m); /* 2*|p0-q0| */ \
+ p1_asub_q1_m = __lsx_vsrli_b(p1_asub_q1_m, 1); /* |p1-q1| / 2 */ \
+ p0_asub_q0_m = __lsx_vsadd_bu(p0_asub_q0_m, p1_asub_q1_m); \
+ mask_out = __lsx_vslt_bu(b_limit_in, p0_asub_q0_m); /* edge sum > b_limit */ \
+ mask_out = __lsx_vmax_bu(flat_out, mask_out); /* fold in every difference */ \
+ p3_asub_p2_m = __lsx_vmax_bu(p3_asub_p2_m, p2_asub_p1_m); \
+ mask_out = __lsx_vmax_bu(p3_asub_p2_m, mask_out); \
+ q2_asub_q1_m = __lsx_vmax_bu(q2_asub_q1_m, q3_asub_q2_m); \
+ mask_out = __lsx_vmax_bu(q2_asub_q1_m, mask_out); \
+ mask_out = __lsx_vslt_bu(limit_in, mask_out); /* set where max > limit */ \
+ mask_out = __lsx_vxori_b(mask_out, 0xff); /* invert the comparison result */ \
+ } while (0)
+
+#define VP8_ST6x1_B(in0, in0_idx, in1, in1_idx, pdst, stride) \
+ do { /* every call site in this file passes stride = 4: 6 adjacent bytes */ \
+ __lsx_vstelm_w(in0, pdst, 0, in0_idx); /* 32-bit element of in0 at pdst */ \
+ __lsx_vstelm_h(in1, pdst + stride, 0, in1_idx); /* 16-bit element of in1 */ \
+ } while (0)
+
+static void loop_filter_horizontal_4_dual_lsx(uint8_t *src, int32_t pitch,
+ const uint8_t *b_limit0_ptr,
+ const uint8_t *limit0_ptr,
+ const uint8_t *thresh0_ptr,
+ const uint8_t *b_limit1_ptr,
+ const uint8_t *limit1_ptr,
+ const uint8_t *thresh1_ptr) { /* Filters the edge between the rows at
+ * src - pitch and src over one 16-byte vector. Loads the four rows above
+ * src and the four rows starting at src, runs LPF_MASK_HEV and
+ * VP8_LPF_FILTER4_4W, and writes back two rows on each side of the edge.
+ * Parameter set 0 and parameter set 1 each occupy one 64-bit half of the
+ * limit/threshold vectors. */
+ int32_t pitch_x2 = pitch << 1;
+ int32_t pitch_x3 = pitch_x2 + pitch;
+ int32_t pitch_x4 = pitch << 2;
+
+ __m128i mask, hev, flat;
+ __m128i thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
+ __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+
+ DUP4_ARG2(__lsx_vldx, src, -pitch_x4, src, -pitch_x3, src, -pitch_x2, src,
+ -pitch, p3, p2, p1, p0); /* rows -4 .. -1 relative to src */
+ q0 = __lsx_vld(src, 0); /* rows 0 .. 3 relative to src follow */
+ DUP2_ARG2(__lsx_vldx, src, pitch, src, pitch_x2, q1, q2);
+ q3 = __lsx_vldx(src, pitch_x3);
+
+ thresh0 = __lsx_vldrepl_b(thresh0_ptr, 0); /* broadcast the first byte */
+ thresh1 = __lsx_vldrepl_b(thresh1_ptr, 0);
+ thresh0 = __lsx_vilvl_d(thresh1, thresh0); /* merge the low halves of both */
+
+ b_limit0 = __lsx_vldrepl_b(b_limit0_ptr, 0);
+ b_limit1 = __lsx_vldrepl_b(b_limit1_ptr, 0);
+ b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0);
+
+ limit0 = __lsx_vldrepl_b(limit0_ptr, 0);
+ limit1 = __lsx_vldrepl_b(limit1_ptr, 0);
+ limit0 = __lsx_vilvl_d(limit1, limit0);
+
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev,
+ mask, flat);
+ VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+
+ __lsx_vstx(p1, src, -pitch_x2); /* only p1, p0, q0, q1 are written back */
+ __lsx_vstx(p0, src, -pitch);
+ __lsx_vst(q0, src, 0);
+ __lsx_vstx(q1, src, pitch);
+}
+
+static void loop_filter_vertical_4_dual_lsx(uint8_t *src, int32_t pitch,
+ const uint8_t *b_limit0_ptr,
+ const uint8_t *limit0_ptr,
+ const uint8_t *thresh0_ptr,
+ const uint8_t *b_limit1_ptr,
+ const uint8_t *limit1_ptr,
+ const uint8_t *thresh1_ptr) { /* Filters the edge between the columns at
+ * src - 1 and src over 16 rows. Loads 16 rows starting four bytes left of
+ * src, transposes them into p3..q3, runs LPF_MASK_HEV and
+ * VP8_LPF_FILTER4_4W, then re-interleaves p1, p0, q0, q1 and stores four
+ * bytes per row starting at src - 2. Parameter set 0 and parameter set 1
+ * each occupy one 64-bit half of the limit/threshold vectors. */
+ uint8_t *src_tmp0 = src - 4;
+ int32_t pitch_x2 = pitch << 1;
+ int32_t pitch_x3 = pitch_x2 + pitch;
+ int32_t pitch_x4 = pitch << 2;
+ __m128i mask, hev, flat;
+ __m128i thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
+ __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+ __m128i row0, row1, row2, row3, row4, row5, row6, row7;
+ __m128i row8, row9, row10, row11, row12, row13, row14, row15;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+
+ row0 = __lsx_vld(src_tmp0, 0); /* rows 0 .. 3 */
+ DUP2_ARG2(__lsx_vldx, src_tmp0, pitch, src_tmp0, pitch_x2, row1, row2);
+ row3 = __lsx_vldx(src_tmp0, pitch_x3);
+ src_tmp0 += pitch_x4;
+ row4 = __lsx_vld(src_tmp0, 0); /* rows 4 .. 7 */
+ DUP2_ARG2(__lsx_vldx, src_tmp0, pitch, src_tmp0, pitch_x2, row5, row6);
+ row7 = __lsx_vldx(src_tmp0, pitch_x3);
+ src_tmp0 += pitch_x4;
+
+ row8 = __lsx_vld(src_tmp0, 0); /* rows 8 .. 11 */
+ DUP2_ARG2(__lsx_vldx, src_tmp0, pitch, src_tmp0, pitch_x2, row9, row10);
+ row11 = __lsx_vldx(src_tmp0, pitch_x3);
+ src_tmp0 += pitch_x4;
+ row12 = __lsx_vld(src_tmp0, 0); /* rows 12 .. 15 */
+ DUP2_ARG2(__lsx_vldx, src_tmp0, pitch, src_tmp0, pitch_x2, row13, row14);
+ row15 = __lsx_vldx(src_tmp0, pitch_x3);
+
+ LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8,
+ row9, row10, row11, row12, row13, row14, row15, p3, p2,
+ p1, p0, q0, q1, q2, q3);
+
+ thresh0 = __lsx_vldrepl_b(thresh0_ptr, 0); /* broadcast the first byte */
+ thresh1 = __lsx_vldrepl_b(thresh1_ptr, 0);
+ thresh0 = __lsx_vilvl_d(thresh1, thresh0); /* merge the low halves of both */
+
+ b_limit0 = __lsx_vldrepl_b(b_limit0_ptr, 0);
+ b_limit1 = __lsx_vldrepl_b(b_limit1_ptr, 0);
+ b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0);
+
+ limit0 = __lsx_vldrepl_b(limit0_ptr, 0);
+ limit1 = __lsx_vldrepl_b(limit1_ptr, 0);
+ limit0 = __lsx_vilvl_d(limit1, limit0);
+
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev,
+ mask, flat);
+ VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+
+ DUP2_ARG2(__lsx_vilvl_b, p0, p1, q1, q0, tmp0, tmp1); /* byte pairs, low half */
+ tmp2 = __lsx_vilvl_h(tmp1, tmp0); /* 4-byte groups used for rows 0 .. 3 */
+ tmp3 = __lsx_vilvh_h(tmp1, tmp0); /* 4-byte groups used for rows 4 .. 7 */
+ DUP2_ARG2(__lsx_vilvh_b, p0, p1, q1, q0, tmp0, tmp1); /* same for high half */
+ tmp4 = __lsx_vilvl_h(tmp1, tmp0); /* rows 8 .. 11 */
+ tmp5 = __lsx_vilvh_h(tmp1, tmp0); /* rows 12 .. 15 */
+
+ src -= 2; /* each store below writes the four bytes src - 2 .. src + 1 */
+ __lsx_vstelm_w(tmp2, src, 0, 0);
+ src += pitch;
+ __lsx_vstelm_w(tmp2, src, 0, 1);
+ src += pitch;
+ __lsx_vstelm_w(tmp2, src, 0, 2);
+ src += pitch;
+ __lsx_vstelm_w(tmp2, src, 0, 3);
+ src += pitch;
+
+ __lsx_vstelm_w(tmp3, src, 0, 0);
+ src += pitch;
+ __lsx_vstelm_w(tmp3, src, 0, 1);
+ src += pitch;
+ __lsx_vstelm_w(tmp3, src, 0, 2);
+ src += pitch;
+ __lsx_vstelm_w(tmp3, src, 0, 3);
+ src += pitch;
+
+ __lsx_vstelm_w(tmp4, src, 0, 0);
+ src += pitch;
+ __lsx_vstelm_w(tmp4, src, 0, 1);
+ src += pitch;
+ __lsx_vstelm_w(tmp4, src, 0, 2);
+ src += pitch;
+ __lsx_vstelm_w(tmp4, src, 0, 3);
+ src += pitch;
+
+ __lsx_vstelm_w(tmp5, src, 0, 0);
+ src += pitch;
+ __lsx_vstelm_w(tmp5, src, 0, 1);
+ src += pitch;
+ __lsx_vstelm_w(tmp5, src, 0, 2);
+ src += pitch;
+ __lsx_vstelm_w(tmp5, src, 0, 3);
+}
+
+static void loop_filter_horizontal_edge_uv_lsx(uint8_t *src_u, uint8_t *src_v,
+ int32_t pitch,
+ const uint8_t b_limit_in,
+ const uint8_t limit_in,
+ const uint8_t thresh_in) { /* Filters the edge between the rows at
+ * src - pitch and src in both planes with one pass: the low 8 bytes of
+ * each u row and each v row are packed into a single vector, filtered with
+ * LPF_MASK_HEV and VP8_LPF_FILTER4_4W, and 8 bytes per row are written back
+ * to each plane for the two rows on either side of the edge. */
+ int32_t pitch_x2 = pitch << 1;
+ int32_t pitch_x3 = pitch_x2 + pitch;
+ int32_t pitch_x4 = pitch << 2;
+
+ __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+ __m128i mask, hev, flat, thresh, limit, b_limit;
+ __m128i p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
+ __m128i p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;
+
+ thresh = __lsx_vreplgr2vr_b(thresh_in); /* broadcast each scalar to all bytes */
+ limit = __lsx_vreplgr2vr_b(limit_in);
+ b_limit = __lsx_vreplgr2vr_b(b_limit_in);
+
+ DUP4_ARG2(__lsx_vldx, src_u, -pitch_x4, src_u, -pitch_x3, src_u, -pitch_x2,
+ src_u, -pitch, p3_u, p2_u, p1_u, p0_u); /* u rows -4 .. -1 */
+ q0_u = __lsx_vld(src_u, 0); /* u rows 0 .. 3 */
+ DUP2_ARG2(__lsx_vldx, src_u, pitch, src_u, pitch_x2, q1_u, q2_u);
+ q3_u = __lsx_vldx(src_u, pitch_x3);
+
+ DUP4_ARG2(__lsx_vldx, src_v, -pitch_x4, src_v, -pitch_x3, src_v, -pitch_x2,
+ src_v, -pitch, p3_v, p2_v, p1_v, p0_v); /* v rows -4 .. -1 */
+ q0_v = __lsx_vld(src_v, 0); /* v rows 0 .. 3 */
+ DUP2_ARG2(__lsx_vldx, src_v, pitch, src_v, pitch_x2, q1_v, q2_v);
+ q3_v = __lsx_vldx(src_v, pitch_x3);
+
+ /* The right (low) 8 elements of each packed vector are u pixels and
+ the left (high) 8 elements are v pixels. */
+ DUP4_ARG2(__lsx_vilvl_d, p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3,
+ p2, p1, p0);
+ DUP4_ARG2(__lsx_vilvl_d, q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0,
+ q1, q2, q3);
+
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+
+ __lsx_vstelm_d(q1, src_u + pitch, 0, 0); /* 64-bit element 0 goes to u */
+ __lsx_vstelm_d(q0, src_u, 0, 0);
+ __lsx_vstelm_d(p0, src_u - pitch, 0, 0);
+ __lsx_vstelm_d(p1, src_u - pitch_x2, 0, 0);
+
+ __lsx_vstelm_d(q1, src_v + pitch, 0, 1); /* 64-bit element 1 goes to v */
+ __lsx_vstelm_d(q0, src_v, 0, 1);
+ __lsx_vstelm_d(p0, src_v - pitch, 0, 1);
+ __lsx_vstelm_d(p1, src_v - pitch_x2, 0, 1);
+}
+
+static void loop_filter_vertical_edge_uv_lsx(uint8_t *src_u, uint8_t *src_v,
+ int32_t pitch,
+ const uint8_t b_limit_in,
+ const uint8_t limit_in,
+ const uint8_t thresh_in) { /* Filters the edge between the columns at
+ * src - 1 and src in both planes with one pass: 8 rows of u and 8 rows of
+ * v, each loaded from four bytes left of the edge, are transposed together
+ * into p3..q3, filtered with LPF_MASK_HEV and VP8_LPF_FILTER4_4W, and four
+ * bytes per row are written back starting two bytes left of the edge. */
+ uint8_t *src_u_tmp, *src_v_tmp;
+ int32_t pitch_x2 = pitch << 1;
+ int32_t pitch_x3 = pitch_x2 + pitch;
+ int32_t pitch_x4 = pitch << 2;
+
+ __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+ __m128i mask, hev, flat, thresh, limit, b_limit;
+ __m128i row0, row1, row2, row3, row4, row5, row6, row7, row8;
+ __m128i row9, row10, row11, row12, row13, row14, row15;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+
+ thresh = __lsx_vreplgr2vr_b(thresh_in); /* broadcast each scalar to all bytes */
+ limit = __lsx_vreplgr2vr_b(limit_in);
+ b_limit = __lsx_vreplgr2vr_b(b_limit_in);
+
+ src_u_tmp = src_u - 4;
+ row0 = __lsx_vld(src_u_tmp, 0); /* u rows 0 .. 3 */
+ DUP2_ARG2(__lsx_vldx, src_u_tmp, pitch, src_u_tmp, pitch_x2, row1, row2);
+ row3 = __lsx_vldx(src_u_tmp, pitch_x3);
+ src_u_tmp += pitch_x4;
+ row4 = __lsx_vld(src_u_tmp, 0); /* u rows 4 .. 7 */
+ DUP2_ARG2(__lsx_vldx, src_u_tmp, pitch, src_u_tmp, pitch_x2, row5, row6);
+ row7 = __lsx_vldx(src_u_tmp, pitch_x3);
+
+ src_v_tmp = src_v - 4;
+ row8 = __lsx_vld(src_v_tmp, 0); /* v rows 0 .. 3 */
+ DUP2_ARG2(__lsx_vldx, src_v_tmp, pitch, src_v_tmp, pitch_x2, row9, row10);
+ row11 = __lsx_vldx(src_v_tmp, pitch_x3);
+ src_v_tmp += pitch_x4;
+ row12 = __lsx_vld(src_v_tmp, 0); /* v rows 4 .. 7 */
+ DUP2_ARG2(__lsx_vldx, src_v_tmp, pitch, src_v_tmp, pitch_x2, row13, row14);
+ row15 = __lsx_vldx(src_v_tmp, pitch_x3);
+
+ LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8,
+ row9, row10, row11, row12, row13, row14, row15, p3, p2,
+ p1, p0, q0, q1, q2, q3);
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+
+ DUP2_ARG2(__lsx_vilvl_b, p0, p1, q1, q0, tmp0, tmp1); /* byte pairs, low half */
+ tmp2 = __lsx_vilvl_h(tmp1, tmp0); /* 4-byte groups stored to u rows 0 .. 3 */
+ tmp3 = __lsx_vilvh_h(tmp1, tmp0); /* 4-byte groups stored to u rows 4 .. 7 */
+
+ tmp0 = __lsx_vilvh_b(p0, p1); /* same interleave for the high half */
+ tmp1 = __lsx_vilvh_b(q1, q0);
+ tmp4 = __lsx_vilvl_h(tmp1, tmp0); /* stored to v rows 0 .. 3 */
+ tmp5 = __lsx_vilvh_h(tmp1, tmp0); /* stored to v rows 4 .. 7 */
+
+ src_u_tmp += 2; /* now at row 4, two bytes left of src_u's column */
+ __lsx_vstelm_w(tmp2, src_u_tmp - pitch_x4, 0, 0);
+ __lsx_vstelm_w(tmp2, src_u_tmp - pitch_x3, 0, 1);
+ __lsx_vstelm_w(tmp2, src_u_tmp - pitch_x2, 0, 2);
+ __lsx_vstelm_w(tmp2, src_u_tmp - pitch, 0, 3);
+
+ __lsx_vstelm_w(tmp3, src_u_tmp, 0, 0);
+ __lsx_vstelm_w(tmp3, src_u_tmp + pitch, 0, 1);
+ __lsx_vstelm_w(tmp3, src_u_tmp + pitch_x2, 0, 2);
+ __lsx_vstelm_w(tmp3, src_u_tmp + pitch_x3, 0, 3);
+
+ src_v_tmp += 2; /* now at row 4, two bytes left of src_v's column */
+ __lsx_vstelm_w(tmp4, src_v_tmp - pitch_x4, 0, 0);
+ __lsx_vstelm_w(tmp4, src_v_tmp - pitch_x3, 0, 1);
+ __lsx_vstelm_w(tmp4, src_v_tmp - pitch_x2, 0, 2);
+ __lsx_vstelm_w(tmp4, src_v_tmp - pitch, 0, 3);
+
+ __lsx_vstelm_w(tmp5, src_v_tmp, 0, 0);
+ __lsx_vstelm_w(tmp5, src_v_tmp + pitch, 0, 1);
+ __lsx_vstelm_w(tmp5, src_v_tmp + pitch_x2, 0, 2);
+ __lsx_vstelm_w(tmp5, src_v_tmp + pitch_x3, 0, 3);
+}
+
+static inline void mbloop_filter_horizontal_edge_y_lsx(
+ uint8_t *src, int32_t pitch, const uint8_t b_limit_in,
+ const uint8_t limit_in, const uint8_t thresh_in) { /* Filters the edge
+ * between the rows at src - pitch and src over one 16-byte vector using
+ * LPF_MASK_HEV and VP8_MBFILTER. Eight rows are loaded (four on each side)
+ * and three rows on each side of the edge are written back. */
+ uint8_t *temp_src;
+ int32_t pitch_x2 = pitch << 1;
+ int32_t pitch_x3 = pitch_x2 + pitch;
+ int32_t pitch_x4 = pitch << 2;
+
+ __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+ __m128i mask, hev, flat, thresh, limit, b_limit;
+
+ DUP2_ARG2(__lsx_vldrepl_b, &b_limit_in, 0, &limit_in, 0, b_limit, limit);
+ thresh = __lsx_vldrepl_b(&thresh_in, 0); /* broadcast each scalar parameter */
+
+ temp_src = src - pitch_x4;
+ DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2,
+ temp_src, pitch_x3, p3, p2, p1, p0); /* rows -4 .. -1 */
+ temp_src += pitch_x4;
+ DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2,
+ temp_src, pitch_x3, q0, q1, q2, q3); /* rows 0 .. 3 */
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+
+ temp_src = src - pitch_x3; /* p3 and q3 are not written back */
+ __lsx_vstx(p2, temp_src, 0);
+ __lsx_vstx(p1, temp_src, pitch);
+ __lsx_vstx(p0, temp_src, pitch_x2);
+ __lsx_vstx(q0, temp_src, pitch_x3);
+ temp_src += pitch_x4; /* now at row +1 relative to src */
+ __lsx_vstx(q1, temp_src, 0);
+ __lsx_vstx(q2, temp_src, pitch);
+}
+
+static inline void mbloop_filter_horizontal_edge_uv_lsx(
+ uint8_t *src_u, uint8_t *src_v, int32_t pitch, const uint8_t b_limit_in,
+ const uint8_t limit_in, const uint8_t thresh_in) { /* Filters the edge
+ * between the rows at src - pitch and src in both planes with one pass:
+ * the low 8 bytes of each u row and each v row are packed into a single
+ * vector, filtered with LPF_MASK_HEV and VP8_MBFILTER, and 8 bytes per row
+ * are written back to each plane for three rows on either side. */
+ uint8_t *temp_src;
+ int32_t pitch_x2 = pitch << 1;
+ int32_t pitch_x3 = pitch_x2 + pitch;
+ int32_t pitch_x4 = pitch << 2;
+ __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+ __m128i mask, hev, flat, thresh, limit, b_limit;
+ __m128i p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
+ __m128i p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;
+
+ DUP2_ARG2(__lsx_vldrepl_b, &b_limit_in, 0, &limit_in, 0, b_limit, limit);
+ thresh = __lsx_vldrepl_b(&thresh_in, 0); /* broadcast each scalar parameter */
+
+ temp_src = src_u - pitch_x4;
+ DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2,
+ temp_src, pitch_x3, p3_u, p2_u, p1_u, p0_u); /* u rows -4 .. -1 */
+ temp_src += pitch_x4;
+ DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2,
+ temp_src, pitch_x3, q0_u, q1_u, q2_u, q3_u); /* u rows 0 .. 3 */
+ temp_src = src_v - pitch_x4;
+ DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2,
+ temp_src, pitch_x3, p3_v, p2_v, p1_v, p0_v); /* v rows -4 .. -1 */
+ temp_src += pitch_x4;
+ DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2,
+ temp_src, pitch_x3, q0_v, q1_v, q2_v, q3_v); /* v rows 0 .. 3 */
+
+ DUP4_ARG2(__lsx_vilvl_d, p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3,
+ p2, p1, p0); /* pack the low 64 bits of the u and v rows together */
+ DUP4_ARG2(__lsx_vilvl_d, q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0,
+ q1, q2, q3);
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+
+ src_u -= pitch_x3; /* 64-bit element 0 of each result goes to the u plane */
+ __lsx_vstelm_d(p2, src_u, 0, 0);
+ __lsx_vstelm_d(p1, src_u + pitch, 0, 0);
+ __lsx_vstelm_d(p0, src_u + pitch_x2, 0, 0);
+ __lsx_vstelm_d(q0, src_u + pitch_x3, 0, 0);
+ src_u += pitch_x4;
+ __lsx_vstelm_d(q1, src_u, 0, 0);
+ src_u += pitch;
+ __lsx_vstelm_d(q2, src_u, 0, 0);
+
+ src_v -= pitch_x3; /* 64-bit element 1 of each result goes to the v plane */
+ __lsx_vstelm_d(p2, src_v, 0, 1);
+ __lsx_vstelm_d(p1, src_v + pitch, 0, 1);
+ __lsx_vstelm_d(p0, src_v + pitch_x2, 0, 1);
+ __lsx_vstelm_d(q0, src_v + pitch_x3, 0, 1);
+ src_v += pitch_x4;
+ __lsx_vstelm_d(q1, src_v, 0, 1);
+ src_v += pitch;
+ __lsx_vstelm_d(q2, src_v, 0, 1);
+}
+
+static inline void mbloop_filter_vertical_edge_y_lsx(uint8_t *src,
+ int32_t pitch,
+ const uint8_t b_limit_in,
+ const uint8_t limit_in,
+ const uint8_t thresh_in) { /* Filters the edge between the columns at
+ * src - 1 and src over 16 rows. Loads 16 rows starting four bytes left of
+ * src, transposes them into p3..q3, runs LPF_MASK_HEV and VP8_MBFILTER,
+ * then re-interleaves p2..q2 and stores six bytes per row starting at
+ * src - 3. */
+ uint8_t *temp_src;
+ int32_t pitch_x2 = pitch << 1;
+ int32_t pitch_x3 = pitch_x2 + pitch;
+ int32_t pitch_x4 = pitch << 2;
+
+ __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+ __m128i mask, hev, flat, thresh, limit, b_limit;
+ __m128i row0, row1, row2, row3, row4, row5, row6, row7, row8;
+ __m128i row9, row10, row11, row12, row13, row14, row15;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+ DUP2_ARG2(__lsx_vldrepl_b, &b_limit_in, 0, &limit_in, 0, b_limit, limit);
+ thresh = __lsx_vldrepl_b(&thresh_in, 0); /* broadcast each scalar parameter */
+ temp_src = src - 4;
+ DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2,
+ temp_src, pitch_x3, row0, row1, row2, row3);
+ temp_src += pitch_x4;
+ DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2,
+ temp_src, pitch_x3, row4, row5, row6, row7);
+ temp_src += pitch_x4;
+ DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2,
+ temp_src, pitch_x3, row8, row9, row10, row11);
+ temp_src += pitch_x4;
+ DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2,
+ temp_src, pitch_x3, row12, row13, row14, row15);
+ temp_src -= pitch_x4; /* no effect: temp_src is reassigned before its next use */
+ LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8,
+ row9, row10, row11, row12, row13, row14, row15, p3, p2,
+ p1, p0, q0, q1, q2, q3);
+
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+ DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, tmp0, tmp1); /* byte pairs, low half */
+ tmp3 = __lsx_vilvl_h(tmp1, tmp0); /* p2 p1 p0 q0 groups for rows 0 .. 3 */
+ tmp4 = __lsx_vilvh_h(tmp1, tmp0); /* p2 p1 p0 q0 groups for rows 4 .. 7 */
+ DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, tmp0, tmp1); /* same for high half */
+ tmp6 = __lsx_vilvl_h(tmp1, tmp0); /* rows 8 .. 11 */
+ tmp7 = __lsx_vilvh_h(tmp1, tmp0); /* rows 12 .. 15 */
+ tmp2 = __lsx_vilvl_b(q2, q1); /* q1 q2 pairs for rows 0 .. 7 */
+ tmp5 = __lsx_vilvh_b(q2, q1); /* q1 q2 pairs for rows 8 .. 15 */
+
+ temp_src = src - 3; /* each row: 4 bytes at temp_src, then 2 bytes at + 4 */
+ VP8_ST6x1_B(tmp3, 0, tmp2, 0, temp_src, 4);
+ temp_src += pitch;
+ VP8_ST6x1_B(tmp3, 1, tmp2, 1, temp_src, 4);
+ temp_src += pitch;
+ VP8_ST6x1_B(tmp3, 2, tmp2, 2, temp_src, 4);
+ temp_src += pitch;
+ VP8_ST6x1_B(tmp3, 3, tmp2, 3, temp_src, 4);
+ temp_src += pitch;
+ VP8_ST6x1_B(tmp4, 0, tmp2, 4, temp_src, 4);
+ temp_src += pitch;
+ VP8_ST6x1_B(tmp4, 1, tmp2, 5, temp_src, 4);
+ temp_src += pitch;
+ VP8_ST6x1_B(tmp4, 2, tmp2, 6, temp_src, 4);
+ temp_src += pitch;
+ VP8_ST6x1_B(tmp4, 3, tmp2, 7, temp_src, 4);
+ temp_src += pitch;
+ VP8_ST6x1_B(tmp6, 0, tmp5, 0, temp_src, 4);
+ temp_src += pitch;
+ VP8_ST6x1_B(tmp6, 1, tmp5, 1, temp_src, 4);
+ temp_src += pitch;
+ VP8_ST6x1_B(tmp6, 2, tmp5, 2, temp_src, 4);
+ temp_src += pitch;
+ VP8_ST6x1_B(tmp6, 3, tmp5, 3, temp_src, 4);
+ temp_src += pitch;
+ VP8_ST6x1_B(tmp7, 0, tmp5, 4, temp_src, 4);
+ temp_src += pitch;
+ VP8_ST6x1_B(tmp7, 1, tmp5, 5, temp_src, 4);
+ temp_src += pitch;
+ VP8_ST6x1_B(tmp7, 2, tmp5, 6, temp_src, 4);
+ temp_src += pitch;
+ VP8_ST6x1_B(tmp7, 3, tmp5, 7, temp_src, 4);
+}
+
+static inline void mbloop_filter_vertical_edge_uv_lsx(
+ uint8_t *src_u, uint8_t *src_v, int32_t pitch, const uint8_t b_limit_in,
+ const uint8_t limit_in, const uint8_t thresh_in) { /* Filters the edge
+ * between the columns at src - 1 and src in both planes with one pass:
+ * 8 rows of u and 8 rows of v, each loaded from four bytes left of the
+ * edge, are transposed together into p3..q3, filtered with LPF_MASK_HEV
+ * and VP8_MBFILTER, and six bytes per row are written back starting three
+ * bytes left of the edge. */
+ int32_t pitch_x2 = pitch << 1;
+ int32_t pitch_x3 = pitch_x2 + pitch;
+ int32_t pitch_x4 = pitch << 2;
+ __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+ __m128i mask, hev, flat, thresh, limit, b_limit;
+ __m128i row0, row1, row2, row3, row4, row5, row6, row7, row8;
+ __m128i row9, row10, row11, row12, row13, row14, row15;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+ DUP2_ARG2(__lsx_vldrepl_b, &b_limit_in, 0, &limit_in, 0, b_limit, limit);
+ thresh = __lsx_vldrepl_b(&thresh_in, 0); /* broadcast each scalar parameter */
+
+ src_u -= 4;
+ DUP4_ARG2(__lsx_vldx, src_u, 0, src_u, pitch, src_u, pitch_x2, src_u,
+ pitch_x3, row0, row1, row2, row3); /* u rows 0 .. 3 */
+ src_u += pitch_x4;
+ DUP4_ARG2(__lsx_vldx, src_u, 0, src_u, pitch, src_u, pitch_x2, src_u,
+ pitch_x3, row4, row5, row6, row7); /* u rows 4 .. 7 */
+ src_v -= 4;
+ DUP4_ARG2(__lsx_vldx, src_v, 0, src_v, pitch, src_v, pitch_x2, src_v,
+ pitch_x3, row8, row9, row10, row11); /* v rows 0 .. 3 */
+ src_v += pitch_x4;
+ DUP4_ARG2(__lsx_vldx, src_v, 0, src_v, pitch, src_v, pitch_x2, src_v,
+ pitch_x3, row12, row13, row14, row15); /* v rows 4 .. 7 */
+ LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8,
+ row9, row10, row11, row12, row13, row14, row15, p3, p2,
+ p1, p0, q0, q1, q2, q3);
+
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+
+ DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, tmp0, tmp1); /* byte pairs, low half */
+ tmp3 = __lsx_vilvl_h(tmp1, tmp0); /* p2 p1 p0 q0 groups for u rows 0 .. 3 */
+ tmp4 = __lsx_vilvh_h(tmp1, tmp0); /* p2 p1 p0 q0 groups for u rows 4 .. 7 */
+ DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, tmp0, tmp1); /* same for high half */
+ tmp6 = __lsx_vilvl_h(tmp1, tmp0); /* v rows 0 .. 3 */
+ tmp7 = __lsx_vilvh_h(tmp1, tmp0); /* v rows 4 .. 7 */
+ tmp2 = __lsx_vilvl_b(q2, q1); /* q1 q2 pairs for the u rows */
+ tmp5 = __lsx_vilvh_b(q2, q1); /* q1 q2 pairs for the v rows */
+
+ src_u += 1 - pitch_x4; /* back to row 0, three bytes left of the edge */
+ VP8_ST6x1_B(tmp3, 0, tmp2, 0, src_u, 4);
+ src_u += pitch;
+ VP8_ST6x1_B(tmp3, 1, tmp2, 1, src_u, 4);
+ src_u += pitch;
+ VP8_ST6x1_B(tmp3, 2, tmp2, 2, src_u, 4);
+ src_u += pitch;
+ VP8_ST6x1_B(tmp3, 3, tmp2, 3, src_u, 4);
+ src_u += pitch;
+ VP8_ST6x1_B(tmp4, 0, tmp2, 4, src_u, 4);
+ src_u += pitch;
+ VP8_ST6x1_B(tmp4, 1, tmp2, 5, src_u, 4);
+ src_u += pitch;
+ VP8_ST6x1_B(tmp4, 2, tmp2, 6, src_u, 4);
+ src_u += pitch;
+ VP8_ST6x1_B(tmp4, 3, tmp2, 7, src_u, 4);
+
+ src_v += 1 - pitch_x4; /* back to row 0, three bytes left of the edge */
+ VP8_ST6x1_B(tmp6, 0, tmp5, 0, src_v, 4);
+ src_v += pitch;
+ VP8_ST6x1_B(tmp6, 1, tmp5, 1, src_v, 4);
+ src_v += pitch;
+ VP8_ST6x1_B(tmp6, 2, tmp5, 2, src_v, 4);
+ src_v += pitch;
+ VP8_ST6x1_B(tmp6, 3, tmp5, 3, src_v, 4);
+ src_v += pitch;
+ VP8_ST6x1_B(tmp7, 0, tmp5, 4, src_v, 4);
+ src_v += pitch;
+ VP8_ST6x1_B(tmp7, 1, tmp5, 5, src_v, 4);
+ src_v += pitch;
+ VP8_ST6x1_B(tmp7, 2, tmp5, 6, src_v, 4);
+ src_v += pitch;
+ VP8_ST6x1_B(tmp7, 3, tmp5, 7, src_v, 4);
+}
+
+void vp8_loop_filter_mbh_lsx(uint8_t *src_y, uint8_t *src_u, uint8_t *src_v,
+ int32_t pitch_y, int32_t pitch_u_v,
+ loop_filter_info *lpf_info_ptr) { /* Applies the horizontal-edge
+ * macroblock filter at src_y and, when src_u is non-NULL, at src_u/src_v.
+ * Only the first byte of mblim, lim and hev_thr is passed on. src_v is
+ * not checked separately. */
+ mbloop_filter_horizontal_edge_y_lsx(src_y, pitch_y, *lpf_info_ptr->mblim,
+ *lpf_info_ptr->lim,
+ *lpf_info_ptr->hev_thr);
+ if (src_u) {
+ mbloop_filter_horizontal_edge_uv_lsx(
+ src_u, src_v, pitch_u_v, *lpf_info_ptr->mblim, *lpf_info_ptr->lim,
+ *lpf_info_ptr->hev_thr);
+ }
+}
+
+void vp8_loop_filter_mbv_lsx(uint8_t *src_y, uint8_t *src_u, uint8_t *src_v,
+ int32_t pitch_y, int32_t pitch_u_v,
+ loop_filter_info *lpf_info_ptr) { /* Applies the vertical-edge
+ * macroblock filter at src_y and, when src_u is non-NULL, at src_u/src_v.
+ * Only the first byte of mblim, lim and hev_thr is passed on. src_v is
+ * not checked separately. */
+ mbloop_filter_vertical_edge_y_lsx(src_y, pitch_y, *lpf_info_ptr->mblim,
+ *lpf_info_ptr->lim, *lpf_info_ptr->hev_thr);
+ if (src_u) {
+ mbloop_filter_vertical_edge_uv_lsx(src_u, src_v, pitch_u_v,
+ *lpf_info_ptr->mblim, *lpf_info_ptr->lim,
+ *lpf_info_ptr->hev_thr);
+ }
+}
+
+void vp8_loop_filter_bh_lsx(uint8_t *src_y, uint8_t *src_u, uint8_t *src_v,
+ int32_t pitch_y, int32_t pitch_u_v,
+ loop_filter_info *lpf_info_ptr) { /* Filters the horizontal edges that
+ * start at rows 4, 8 and 12 of src_y, passing the same blim/lim/hev_thr
+ * pointers for both halves of each 16-byte vector. When src_u is non-NULL
+ * it also filters the edge at row 4 of src_u and src_v, using the first
+ * byte of each parameter. src_v is not checked separately. */
+ loop_filter_horizontal_4_dual_lsx(src_y + 4 * pitch_y, pitch_y,
+ lpf_info_ptr->blim, lpf_info_ptr->lim,
+ lpf_info_ptr->hev_thr, lpf_info_ptr->blim,
+ lpf_info_ptr->lim, lpf_info_ptr->hev_thr);
+ loop_filter_horizontal_4_dual_lsx(src_y + 8 * pitch_y, pitch_y,
+ lpf_info_ptr->blim, lpf_info_ptr->lim,
+ lpf_info_ptr->hev_thr, lpf_info_ptr->blim,
+ lpf_info_ptr->lim, lpf_info_ptr->hev_thr);
+ loop_filter_horizontal_4_dual_lsx(src_y + 12 * pitch_y, pitch_y,
+ lpf_info_ptr->blim, lpf_info_ptr->lim,
+ lpf_info_ptr->hev_thr, lpf_info_ptr->blim,
+ lpf_info_ptr->lim, lpf_info_ptr->hev_thr);
+ if (src_u) {
+ loop_filter_horizontal_edge_uv_lsx(
+ src_u + (4 * pitch_u_v), src_v + (4 * pitch_u_v), pitch_u_v,
+ *lpf_info_ptr->blim, *lpf_info_ptr->lim, *lpf_info_ptr->hev_thr);
+ }
+}
+
+void vp8_loop_filter_bv_lsx(uint8_t *src_y, uint8_t *src_u, uint8_t *src_v,
+ int32_t pitch_y, int32_t pitch_u_v,
+ loop_filter_info *lpf_info_ptr) { /* Filters the vertical edges that
+ * start at columns 4, 8 and 12 of src_y, passing the same blim/lim/hev_thr
+ * pointers for both halves of each 16-row vector. When src_u is non-NULL
+ * it also filters the edge at column 4 of src_u and src_v, using the first
+ * byte of each parameter. src_v is not checked separately. */
+ loop_filter_vertical_4_dual_lsx(src_y + 4, pitch_y, lpf_info_ptr->blim,
+ lpf_info_ptr->lim, lpf_info_ptr->hev_thr,
+ lpf_info_ptr->blim, lpf_info_ptr->lim,
+ lpf_info_ptr->hev_thr);
+ loop_filter_vertical_4_dual_lsx(src_y + 8, pitch_y, lpf_info_ptr->blim,
+ lpf_info_ptr->lim, lpf_info_ptr->hev_thr,
+ lpf_info_ptr->blim, lpf_info_ptr->lim,
+ lpf_info_ptr->hev_thr);
+ loop_filter_vertical_4_dual_lsx(src_y + 12, pitch_y, lpf_info_ptr->blim,
+ lpf_info_ptr->lim, lpf_info_ptr->hev_thr,
+ lpf_info_ptr->blim, lpf_info_ptr->lim,
+ lpf_info_ptr->hev_thr);
+ if (src_u) {
+ loop_filter_vertical_edge_uv_lsx(src_u + 4, src_v + 4, pitch_u_v,
+ *lpf_info_ptr->blim, *lpf_info_ptr->lim,
+ *lpf_info_ptr->hev_thr);
+ }
+}
diff --git a/media/libvpx/libvpx/vp8/common/loongarch/sixtap_filter_lsx.c b/media/libvpx/libvpx/vp8/common/loongarch/sixtap_filter_lsx.c
new file mode 100644
index 0000000000..cd7ba54746
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/loongarch/sixtap_filter_lsx.c
@@ -0,0 +1,1903 @@
+/*
+ * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vp8/common/filter.h"
+#include "vpx_ports/mem.h"
+#include "vpx_util/loongson_intrinsics.h"
+
+/*
+ * Signed 8-bit filter taps: 7 rows of 8 bytes, declared with 16-byte
+ * alignment. Each row holds six taps followed by two zero padding bytes, and
+ * the six taps of every row sum to 128. Rows whose first and sixth taps are
+ * zero have only four non-zero taps.
+ * NOTE(review): presumably indexed by (sub-pixel offset - 1); the indexing
+ * code is outside this section - confirm against the callers.
+ */
+DECLARE_ALIGNED(16, static const int8_t, vp8_subpel_filters_lsx[7][8]) = {
+ { 0, -6, 123, 12, -1, 0, 0, 0 },
+ { 2, -11, 108, 36, -8, 1, 0, 0 }, /* New 1/4 pel 6 tap filter */
+ { 0, -9, 93, 50, -6, 0, 0, 0 },
+ { 3, -16, 77, 77, -16, 3, 0, 0 }, /* New 1/2 pel 6 tap filter */
+ { 0, -6, 50, 93, -9, 0, 0, 0 },
+ { 1, -8, 36, 108, -11, 2, 0, 0 }, /* New 1/4 pel 6 tap filter */
+ { 0, -1, 12, 123, -6, 0, 0, 0 },
+};
+
+/*
+ * Three 16-byte byte-shuffle masks used with __lsx_vshuf_b. Each mask lists
+ * pairs of adjacent byte indices (0,1), (1,2), (2,3), ... so that a shuffle
+ * gathers neighbouring source bytes side by side for the pairwise dot
+ * products. The 8-wide kernels load the mask at byte offset 0 and the 4-wide
+ * kernels load the one at byte offset 16; the filters derive the masks for
+ * the later tap pairs by adding 2 and 4 to every index.
+ * NOTE(review): indices 16 and above presumably select bytes from the other
+ * source vector of the shuffle, letting one vector carry two 4-pixel rows -
+ * confirm against the LSX vshuf.b definition.
+ */
+static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = {
+ /* 8 width cases */
+ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+ /* 4 width cases */
+ 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+ /* 4 width cases */
+ 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+};
+
+/*
+ * Three-term dot-product accumulation: starts from the byte-pair dot product
+ * of (in0, coeff0), then accumulates the dot products of (in1, coeff1) and
+ * (in2, coeff2) on top of it, and returns the accumulated vector.
+ */
+static INLINE __m128i dpadd_h3(__m128i in0, __m128i in1, __m128i in2,
+                               __m128i coeff0, __m128i coeff1, __m128i coeff2) {
+  const __m128i first = __lsx_vdp2_h_b(in0, coeff0);
+  const __m128i second = __lsx_vdp2add_h_b(first, in1, coeff1);
+
+  return __lsx_vdp2add_h_b(second, in2, coeff2);
+}
+
+/*
+ * Horizontal 6-tap filter over the byte pair (src1, src0). The three masks
+ * pick the byte pairs that feed each of the three coefficient pairs; the
+ * accumulated sum is rounded and shifted right by VP8_FILTER_SHIFT and then
+ * saturated with __lsx_vsat_h(..., 7).
+ */
+static INLINE __m128i horiz_6tap_filt(__m128i src0, __m128i src1, __m128i mask0,
+                                      __m128i mask1, __m128i mask2,
+                                      __m128i filt_h0, __m128i filt_h1,
+                                      __m128i filt_h2) {
+  const __m128i pair0 = __lsx_vshuf_b(src1, src0, mask0);
+  const __m128i pair1 = __lsx_vshuf_b(src1, src0, mask1);
+  const __m128i pair2 = __lsx_vshuf_b(src1, src0, mask2);
+  __m128i sum = dpadd_h3(pair0, pair1, pair2, filt_h0, filt_h1, filt_h2);
+
+  sum = __lsx_vsrari_h(sum, VP8_FILTER_SHIFT);
+  return __lsx_vsat_h(sum, 7);
+}
+
+/*
+ * Two-term dot-product accumulation: the byte-pair dot product of
+ * (vec0, filt0) plus the accumulated dot product of (vec1, filt1).
+ */
+static INLINE __m128i filt_4tap_dpadd_h(__m128i vec0, __m128i vec1,
+                                        __m128i filt0, __m128i filt1) {
+  const __m128i first = __lsx_vdp2_h_b(vec0, filt0);
+
+  return __lsx_vdp2add_h_b(first, vec1, filt1);
+}
+
+/*
+ * Horizontal 4-tap filter over the byte pair (src1, src0). The two masks pick
+ * the byte pairs that feed each coefficient pair; the accumulated sum is
+ * rounded and shifted right by VP8_FILTER_SHIFT and then saturated with
+ * __lsx_vsat_h(..., 7).
+ */
+static INLINE __m128i horiz_4tap_filt(__m128i src0, __m128i src1, __m128i mask0,
+                                      __m128i mask1, __m128i filt_h0,
+                                      __m128i filt_h1) {
+  const __m128i pair0 = __lsx_vshuf_b(src1, src0, mask0);
+  const __m128i pair1 = __lsx_vshuf_b(src1, src0, mask1);
+  __m128i sum = filt_4tap_dpadd_h(pair0, pair1, filt_h0, filt_h1);
+
+  sum = __lsx_vsrari_h(sum, VP8_FILTER_SHIFT);
+  return __lsx_vsat_h(sum, 7);
+}
+
+/*
+ * 6-tap horizontal filter accumulation for four source vectors processed as
+ * two pairs: (src1, src0) feeds out0 and (src3, src2) feeds out1. For each of
+ * the three masks the pair is shuffled and its dot product with the matching
+ * filter vector (filt0, filt1, filt2) is accumulated. The macro only produces
+ * the raw sums; rounding, shifting and narrowing are left to the caller.
+ * out0 and out1 are assigned and must therefore be modifiable lvalues.
+ */
+#define HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \
+ mask2, filt0, filt1, filt2, out0, out1) \
+ do { \
+ __m128i vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m; \
+ \
+ DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src3, src2, mask0, vec0_m, \
+ vec1_m); \
+ DUP2_ARG2(__lsx_vdp2_h_b, vec0_m, filt0, vec1_m, filt0, out0, out1); \
+ DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask1, src3, src2, mask1, vec2_m, \
+ vec3_m); \
+ DUP2_ARG3(__lsx_vdp2add_h_b, out0, vec2_m, filt1, out1, vec3_m, filt1, \
+ out0, out1); \
+ DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask2, src3, src2, mask2, vec4_m, \
+ vec5_m); \
+ DUP2_ARG3(__lsx_vdp2add_h_b, out0, vec4_m, filt2, out1, vec5_m, filt2, \
+ out0, out1); \
+ } while (0)
+
+/*
+ * 6-tap horizontal filter accumulation for four independent source vectors:
+ * srcN feeds outN (N = 0..3). Each source is shuffled against itself with
+ * mask0, mask1 and mask2, and the dot products with filt0, filt1 and filt2
+ * are accumulated into the matching output. The macro only produces the raw
+ * sums; rounding, shifting and narrowing are left to the caller.
+ * out0..out3 are assigned and must therefore be modifiable lvalues.
+ */
+#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \
+ mask2, filt0, filt1, filt2, out0, out1, \
+ out2, out3) \
+ do { \
+ __m128i vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
+ \
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, vec0_m, \
+ vec1_m); \
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src3, src3, mask0, vec2_m, \
+ vec3_m); \
+ DUP4_ARG2(__lsx_vdp2_h_b, vec0_m, filt0, vec1_m, filt0, vec2_m, filt0, \
+ vec3_m, filt0, out0, out1, out2, out3); \
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, vec0_m, \
+ vec1_m); \
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask1, src3, src3, mask1, vec2_m, \
+ vec3_m); \
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2, vec4_m, \
+ vec5_m); \
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask2, src3, src3, mask2, vec6_m, \
+ vec7_m); \
+ DUP4_ARG3(__lsx_vdp2add_h_b, out0, vec0_m, filt1, out1, vec1_m, filt1, \
+ out2, vec2_m, filt1, out3, vec3_m, filt1, out0, out1, out2, \
+ out3); \
+ DUP4_ARG3(__lsx_vdp2add_h_b, out0, vec4_m, filt2, out1, vec5_m, filt2, \
+ out2, vec6_m, filt2, out3, vec7_m, filt2, out0, out1, out2, \
+ out3); \
+ } while (0)
+
+/*
+ * 4-tap horizontal filter accumulation for four source vectors processed as
+ * two pairs: (src1, src0) feeds out0 and (src3, src2) feeds out1. Each pair is
+ * shuffled with mask0 and mask1 and the dot products with filt0 and filt1 are
+ * accumulated. The macro only produces the raw sums; rounding, shifting and
+ * narrowing are left to the caller. out0 and out1 are assigned and must
+ * therefore be modifiable lvalues.
+ */
+#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \
+ filt0, filt1, out0, out1) \
+ do { \
+ __m128i vec0_m, vec1_m, vec2_m, vec3_m; \
+ \
+ DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src3, src2, mask0, vec0_m, \
+ vec1_m); \
+ DUP2_ARG2(__lsx_vdp2_h_b, vec0_m, filt0, vec1_m, filt0, out0, out1); \
+ DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask1, src3, src2, mask1, vec2_m, \
+ vec3_m); \
+ DUP2_ARG3(__lsx_vdp2add_h_b, out0, vec2_m, filt1, out1, vec3_m, filt1, \
+ out0, out1); \
+ } while (0)
+
+/*
+ * 4-tap horizontal filter accumulation for four independent source vectors:
+ * srcN feeds outN (N = 0..3). Each source is shuffled against itself with
+ * mask0 and mask1 and the dot products with filt0 and filt1 are accumulated
+ * into the matching output. The macro only produces the raw sums; rounding,
+ * shifting and narrowing are left to the caller. out0..out3 are assigned and
+ * must therefore be modifiable lvalues.
+ */
+#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \
+ filt0, filt1, out0, out1, out2, out3) \
+ do { \
+ __m128i vec0_m, vec1_m, vec2_m, vec3_m; \
+ \
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, vec0_m, \
+ vec1_m); \
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src3, src3, mask0, vec2_m, \
+ vec3_m); \
+ DUP4_ARG2(__lsx_vdp2_h_b, vec0_m, filt0, vec1_m, filt0, vec2_m, filt0, \
+ vec3_m, filt0, out0, out1, out2, out3); \
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, vec0_m, \
+ vec1_m); \
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask1, src3, src3, mask1, vec2_m, \
+ vec3_m); \
+ DUP4_ARG3(__lsx_vdp2add_h_b, out0, vec0_m, filt1, out1, vec1_m, filt1, \
+ out2, vec2_m, filt1, out3, vec3_m, filt1, out0, out1, out2, \
+ out3); \
+ } while (0)
+
+/*
+ * Horizontal 6-tap filter for a 4-wide, 4-row block.
+ *
+ * src/src_stride: source pixels and row pitch; reading starts 2 bytes to the
+ *                 left of src.
+ * dst/dst_stride: destination and row pitch; 4 bytes are stored per row.
+ * filter:         tap bytes; three 16-bit pairs are read at byte offsets 0, 2
+ *                 and 4.
+ *
+ * Declared with the INLINE macro, like the other inline helpers in this file,
+ * rather than the bare `inline` keyword.
+ */
+static INLINE void common_hz_6t_4x4_lsx(uint8_t *RESTRICT src,
+                                        int32_t src_stride,
+                                        uint8_t *RESTRICT dst,
+                                        int32_t dst_stride,
+                                        const int8_t *filter) {
+  __m128i src0, src1, src2, src3, filt0, filt1, filt2;
+  __m128i mask0, mask1, mask2, out0, out1;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+
+  /* Second mask of the table: the 4-wide shuffle pattern. */
+  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16);
+  src -= 2;
+
+  /* Broadcast each pair of taps across a whole vector. */
+  DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+  filt2 = __lsx_vldrepl_h(filter, 4);
+
+  /* Masks for the second and third tap pairs: same pattern moved by 2 and 4. */
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  src0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src1, src2);
+  src3 = __lsx_vldx(src, src_stride_x3);
+
+  /* Flip the top bit of every byte before the signed dot products. */
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0,
+                             filt1, filt2, out0, out1);
+  /* Round, shift, saturate and narrow to bytes, then undo the bit flip. */
+  out0 = __lsx_vssrarni_b_h(out1, out0, VP8_FILTER_SHIFT);
+  out0 = __lsx_vxori_b(out0, 128);
+
+  /* One 32-bit element (4 pixels) per destination row. */
+  __lsx_vstelm_w(out0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(out0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(out0, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(out0, dst, 0, 3);
+}
+
+/*
+ * Horizontal 6-tap filter for a 4-wide, 8-row block. Works on two groups of
+ * four rows: each group is loaded, filtered with HORIZ_6TAP_4WID_4VECS_FILT,
+ * and the results of both groups are rounded/narrowed together and stored as
+ * one 32-bit element (4 pixels) per destination row. Reading starts 2 bytes
+ * to the left of src.
+ */
+static void common_hz_6t_4x8_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+ uint8_t *RESTRICT dst, int32_t dst_stride,
+ const int8_t *filter) {
+ __m128i src0, src1, src2, src3, filt0, filt1, filt2;
+ __m128i mask0, mask1, mask2, out0, out1, out2, out3;
+ int32_t src_stride_x2 = src_stride << 1;
+ int32_t src_stride_x3 = src_stride_x2 + src_stride;
+ int32_t src_stride_x4 = src_stride_x2 << 1;
+
+ /* Second mask of the table: the 4-wide shuffle pattern. */
+ mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16);
+ src -= 2;
+
+ /* Broadcast each pair of taps; derive masks for the later tap pairs. */
+ DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+ filt2 = __lsx_vldrepl_h(filter, 4);
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+
+ /* Rows 0-3. */
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src1, src2);
+ src3 = __lsx_vldx(src, src_stride_x3);
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);
+ src += src_stride_x4;
+ HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0,
+ filt1, filt2, out0, out1);
+
+ /* Rows 4-7. */
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src1, src2);
+ src3 = __lsx_vldx(src, src_stride_x3);
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);
+ HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0,
+ filt1, filt2, out2, out3);
+
+ /* Round, shift, saturate and narrow to bytes, then undo the XOR with 128. */
+ DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, VP8_FILTER_SHIFT, out3, out2,
+ VP8_FILTER_SHIFT, out0, out1);
+ DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+ __lsx_vstelm_w(out0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_w(out0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_w(out0, dst, 0, 2);
+ dst += dst_stride;
+ __lsx_vstelm_w(out0, dst, 0, 3);
+ dst += dst_stride;
+
+ __lsx_vstelm_w(out1, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_w(out1, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_w(out1, dst, 0, 2);
+ dst += dst_stride;
+ __lsx_vstelm_w(out1, dst, 0, 3);
+}
+
+/*
+ * Dispatches the 4-wide horizontal 6-tap filter to the 4-row or 8-row kernel.
+ * Any other height is ignored and nothing is written.
+ */
+static void common_hz_6t_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                uint8_t *RESTRICT dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height) {
+  switch (height) {
+    case 4:
+      common_hz_6t_4x4_lsx(src, src_stride, dst, dst_stride, filter);
+      break;
+    case 8:
+      common_hz_6t_4x8_lsx(src, src_stride, dst, dst_stride, filter);
+      break;
+    default: break;
+  }
+}
+
+/*
+ * Horizontal 6-tap filter for an 8-wide block, four rows per iteration.
+ *
+ * src/src_stride: source pixels and row pitch; reading starts 2 bytes to the
+ *                 left of src.
+ * dst/dst_stride: destination and row pitch; 8 bytes are stored per row.
+ * filter:         tap bytes, loaded as one vector; the first three 16-bit
+ *                 elements are broadcast as the three tap pairs.
+ * height:         number of rows; height >> 2 groups of four rows are
+ *                 processed.
+ *
+ * All row groups go through one loop. The earlier form peeled the first group
+ * and then looped (height >> 2) - 1 times on an unsigned counter, which wraps
+ * around when height is below 4.
+ */
+static void common_hz_6t_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                uint8_t *RESTRICT dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+  int32_t dst_stride_x2 = dst_stride << 1;
+  int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+  int32_t dst_stride_x4 = dst_stride << 2;
+  __m128i src0, src1, src2, src3, filt0, filt1, filt2;
+  __m128i mask0, mask1, mask2, tmp0, tmp1;
+  __m128i filt, out0, out1, out2, out3;
+
+  /* First mask of the table: the 8-wide shuffle pattern. */
+  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0);
+  src -= 2;
+
+  /* Broadcast each pair of taps; derive masks for the later tap pairs. */
+  filt = __lsx_vld(filter, 0);
+  DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1);
+  filt2 = __lsx_vreplvei_h(filt, 2);
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+              src_stride_x3, src0, src1, src2, src3);
+    /* Flip the top bit of every byte before the signed dot products. */
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+              src1, src2, src3);
+    src += src_stride_x4;
+    HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               filt0, filt1, filt2, out0, out1, out2, out3);
+    /* Round, shift, saturate and narrow to bytes, then undo the bit flip. */
+    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, VP8_FILTER_SHIFT, out3, out2,
+              VP8_FILTER_SHIFT, tmp0, tmp1);
+    DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+    /* Each 64-bit half of tmp0/tmp1 is one 8-pixel output row. */
+    __lsx_vstelm_d(tmp0, dst, 0, 0);
+    __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1);
+    __lsx_vstelm_d(tmp1, dst + dst_stride_x2, 0, 0);
+    __lsx_vstelm_d(tmp1, dst + dst_stride_x3, 0, 1);
+    dst += dst_stride_x4;
+  }
+}
+
+/*
+ * Horizontal 6-tap filter for a 16-wide block, four rows per iteration
+ * (height >> 2 iterations). Every row is loaded twice, at src and at src + 8,
+ * and each load is filtered with the 8-wide pattern; the two halves are then
+ * combined with __lsx_vpickev_b and stored as one 16-byte row. Reading starts
+ * 2 bytes to the left of src.
+ */
+static void common_hz_6t_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+ uint8_t *RESTRICT dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ int32_t src_stride_x2 = src_stride << 1;
+ int32_t src_stride_x3 = src_stride_x2 + src_stride;
+ int32_t src_stride_x4 = src_stride << 2;
+ int32_t dst_stride_x2 = dst_stride << 1;
+ int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+ int32_t dst_stride_x4 = dst_stride << 2;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, filt0, filt1, filt2;
+ __m128i mask0, mask1, mask2, out;
+ __m128i filt, out0, out1, out2, out3, out4, out5, out6, out7;
+
+ /* First mask of the table: the 8-wide shuffle pattern. */
+ mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0);
+ src -= 2;
+
+ /* Broadcast each pair of taps; derive masks for the later tap pairs. */
+ filt = __lsx_vld(filter, 0);
+ DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1);
+ filt2 = __lsx_vreplvei_h(filt, 2);
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ /* Even-numbered vectors: left half of rows 0-3. */
+ DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+ src_stride_x3, src0, src2, src4, src6);
+ src += 8;
+ /* Odd-numbered vectors: right half (8 bytes further) of the same rows. */
+ DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+ src_stride_x3, src1, src3, src5, src7);
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);
+ DUP4_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src6, 128, src7, 128, src4,
+ src5, src6, src7);
+ /* Undo the +8 and advance four rows. */
+ src += src_stride_x4 - 8;
+
+ HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ filt0, filt1, filt2, out0, out1, out2, out3);
+ HORIZ_6TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
+ filt0, filt1, filt2, out4, out5, out6, out7);
+ /* Rounding shift, then saturation, done as separate steps here. */
+ DUP4_ARG2(__lsx_vsrari_h, out0, VP8_FILTER_SHIFT, out1, VP8_FILTER_SHIFT,
+ out2, VP8_FILTER_SHIFT, out3, VP8_FILTER_SHIFT, out0, out1, out2,
+ out3);
+ DUP4_ARG2(__lsx_vsrari_h, out4, VP8_FILTER_SHIFT, out5, VP8_FILTER_SHIFT,
+ out6, VP8_FILTER_SHIFT, out7, VP8_FILTER_SHIFT, out4, out5, out6,
+ out7);
+ DUP4_ARG2(__lsx_vsat_h, out0, 7, out1, 7, out2, 7, out3, 7, out0, out1,
+ out2, out3);
+ DUP4_ARG2(__lsx_vsat_h, out4, 7, out5, 7, out6, 7, out7, 7, out4, out5,
+ out6, out7);
+ /* Pack the left/right halves of each row, undo the XOR with 128, store. */
+ out = __lsx_vpickev_b(out1, out0);
+ out = __lsx_vxori_b(out, 128);
+ __lsx_vst(out, dst, 0);
+ out = __lsx_vpickev_b(out3, out2);
+ out = __lsx_vxori_b(out, 128);
+ __lsx_vstx(out, dst, dst_stride);
+ out = __lsx_vpickev_b(out5, out4);
+ out = __lsx_vxori_b(out, 128);
+ __lsx_vstx(out, dst, dst_stride_x2);
+ out = __lsx_vpickev_b(out7, out6);
+ out = __lsx_vxori_b(out, 128);
+ __lsx_vstx(out, dst, dst_stride_x3);
+ dst += dst_stride_x4;
+ }
+}
+
+/*
+ * Vertical 6-tap filter for a 4-wide block, four output rows per iteration
+ * (height >> 2 iterations). Reads rows from 2 above src onwards. Adjacent
+ * rows are byte-interleaved and two interleaved pairs are packed into one
+ * vector, so each dpadd_h3 call yields two output rows. The interleaved
+ * vectors of the last four rows are carried over to the next iteration.
+ */
+static void common_vt_6t_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+ uint8_t *RESTRICT dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+ __m128i src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+ __m128i src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
+ __m128i out0, out1;
+ int32_t src_stride_x2 = src_stride << 1;
+ int32_t src_stride_x3 = src_stride_x2 + src_stride;
+ int32_t src_stride_x4 = src_stride << 2;
+
+ /* Broadcast each pair of taps across a whole vector. */
+ DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+ filt2 = __lsx_vldrepl_h(filter, 4);
+
+ /* Prologue: rows -2 .. +2 relative to src. */
+ DUP2_ARG2(__lsx_vldx, src, -src_stride_x2, src, -src_stride, src0, src1);
+ src2 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src3, src4);
+ src += src_stride_x3;
+
+ DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3,
+ src10_r, src21_r, src32_r, src43_r);
+ DUP2_ARG2(__lsx_vilvl_d, src21_r, src10_r, src43_r, src32_r, src2110,
+ src4332);
+ /* The XOR with 128 is applied after interleaving, on the packed vectors. */
+ DUP2_ARG2(__lsx_vxori_b, src2110, 128, src4332, 128, src2110, src4332);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ src5 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src6, src7);
+ src8 = __lsx_vldx(src, src_stride_x3);
+ src += src_stride_x4;
+
+ DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7,
+ src54_r, src65_r, src76_r, src87_r);
+ DUP2_ARG2(__lsx_vilvl_d, src65_r, src54_r, src87_r, src76_r, src6554,
+ src8776);
+ DUP2_ARG2(__lsx_vxori_b, src6554, 128, src8776, 128, src6554, src8776);
+ out0 = dpadd_h3(src2110, src4332, src6554, filt0, filt1, filt2);
+ out1 = dpadd_h3(src4332, src6554, src8776, filt0, filt1, filt2);
+
+ /* Round, shift, saturate and narrow to bytes, then undo the XOR with 128. */
+ out0 = __lsx_vssrarni_b_h(out1, out0, VP8_FILTER_SHIFT);
+ out0 = __lsx_vxori_b(out0, 128);
+
+ __lsx_vstelm_w(out0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_w(out0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_w(out0, dst, 0, 2);
+ dst += dst_stride;
+ __lsx_vstelm_w(out0, dst, 0, 3);
+ dst += dst_stride;
+
+ /* Slide the window down by four rows; src4 stays un-XORed here. */
+ src2110 = src6554;
+ src4332 = src8776;
+ src4 = src8;
+ }
+}
+
+/*
+ * Vertical 6-tap filter for an 8-wide block, four output rows per iteration
+ * (height >> 2 iterations). Reads rows starting 2 above src. Adjacent rows
+ * are byte-interleaved (low halves) and each output row is the dpadd_h3 of
+ * three interleaved row pairs. The interleaved pairs of the last four rows
+ * are carried over to the next iteration.
+ */
+static void common_vt_6t_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+ uint8_t *RESTRICT dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ int32_t src_stride_x2 = src_stride << 1;
+ int32_t src_stride_x3 = src_stride_x2 + src_stride;
+ int32_t src_stride_x4 = src_stride << 2;
+ int32_t dst_stride_x2 = dst_stride << 1;
+ int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+ int32_t dst_stride_x4 = dst_stride << 2;
+ __m128i src0, src1, src2, src3, src4, src7, src8, src9, src10;
+ __m128i src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
+ __m128i src109_r, filt0, filt1, filt2;
+ __m128i tmp0, tmp1;
+ __m128i filt, out0_r, out1_r, out2_r, out3_r;
+
+ /* Start two rows above the block. */
+ src -= src_stride_x2;
+ filt = __lsx_vld(filter, 0);
+ DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1);
+ filt2 = __lsx_vreplvei_h(filt, 2);
+
+ /* Prologue: the first five rows. */
+ DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+ src_stride_x3, src0, src1, src2, src3);
+ src += src_stride_x4;
+ src4 = __lsx_vld(src, 0);
+ src += src_stride;
+
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);
+ src4 = __lsx_vxori_b(src4, 128);
+ DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src2, src1, src4, src3,
+ src10_r, src32_r, src21_r, src43_r);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+ src_stride_x3, src7, src8, src9, src10);
+ DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7,
+ src8, src9, src10);
+ src += src_stride_x4;
+
+ DUP4_ARG2(__lsx_vilvl_b, src7, src4, src8, src7, src9, src8, src10, src9,
+ src76_r, src87_r, src98_r, src109_r);
+ out0_r = dpadd_h3(src10_r, src32_r, src76_r, filt0, filt1, filt2);
+ out1_r = dpadd_h3(src21_r, src43_r, src87_r, filt0, filt1, filt2);
+ out2_r = dpadd_h3(src32_r, src76_r, src98_r, filt0, filt1, filt2);
+ out3_r = dpadd_h3(src43_r, src87_r, src109_r, filt0, filt1, filt2);
+ /* Round, shift, saturate and narrow to bytes, then undo the XOR with 128. */
+ DUP2_ARG3(__lsx_vssrarni_b_h, out1_r, out0_r, VP8_FILTER_SHIFT, out3_r,
+ out2_r, VP8_FILTER_SHIFT, tmp0, tmp1);
+ DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+ /* Each 64-bit half of tmp0/tmp1 is one 8-pixel output row. */
+ __lsx_vstelm_d(tmp0, dst, 0, 0);
+ __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1);
+ __lsx_vstelm_d(tmp1, dst + dst_stride_x2, 0, 0);
+ __lsx_vstelm_d(tmp1, dst + dst_stride_x3, 0, 1);
+ dst += dst_stride_x4;
+
+ /* Slide the window down by four rows. */
+ src10_r = src76_r;
+ src32_r = src98_r;
+ src21_r = src87_r;
+ src43_r = src109_r;
+ src4 = src10;
+ }
+}
+
+/*
+ * Vertical 6-tap filter for a 16-wide block, four output rows per iteration
+ * (height >> 2 iterations). Reads rows starting 2 above src. Adjacent rows
+ * are byte-interleaved separately for the low (_r) and high (_l) halves; each
+ * half is filtered with dpadd_h3 and the two halves are narrowed together
+ * into one 16-byte output row. The interleaved pairs of the last four rows
+ * are carried over to the next iteration.
+ */
+static void common_vt_6t_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+ uint8_t *RESTRICT dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ int32_t src_stride_x2 = src_stride << 1;
+ int32_t src_stride_x3 = src_stride_x2 + src_stride;
+ int32_t src_stride_x4 = src_stride << 2;
+ int32_t dst_stride_x2 = dst_stride << 1;
+ int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+ int32_t dst_stride_x4 = dst_stride << 2;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+ __m128i src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+ __m128i src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
+ __m128i src65_l, src87_l, filt0, filt1, filt2;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+ __m128i filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+
+ /* Start two rows above the block. */
+ src -= src_stride_x2;
+ filt = __lsx_vld(filter, 0);
+ DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1);
+ filt2 = __lsx_vreplvei_h(filt, 2);
+
+ /* Prologue: the first five rows. */
+ DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+ src_stride_x3, src0, src1, src2, src3);
+ src += src_stride_x4;
+ src4 = __lsx_vldx(src, 0);
+ src += src_stride;
+
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);
+ src4 = __lsx_vxori_b(src4, 128);
+ DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src4, src3, src2, src1,
+ src10_r, src32_r, src43_r, src21_r);
+ DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src4, src3, src2, src1,
+ src10_l, src32_l, src43_l, src21_l);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+ src_stride_x3, src5, src6, src7, src8);
+ src += src_stride_x4;
+
+ DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128, src5,
+ src6, src7, src8);
+ DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7,
+ src54_r, src65_r, src76_r, src87_r);
+ DUP4_ARG2(__lsx_vilvh_b, src5, src4, src6, src5, src7, src6, src8, src7,
+ src54_l, src65_l, src76_l, src87_l);
+ out0_r = dpadd_h3(src10_r, src32_r, src54_r, filt0, filt1, filt2);
+ out1_r = dpadd_h3(src21_r, src43_r, src65_r, filt0, filt1, filt2);
+ out2_r = dpadd_h3(src32_r, src54_r, src76_r, filt0, filt1, filt2);
+ out3_r = dpadd_h3(src43_r, src65_r, src87_r, filt0, filt1, filt2);
+ out0_l = dpadd_h3(src10_l, src32_l, src54_l, filt0, filt1, filt2);
+ out1_l = dpadd_h3(src21_l, src43_l, src65_l, filt0, filt1, filt2);
+ out2_l = dpadd_h3(src32_l, src54_l, src76_l, filt0, filt1, filt2);
+ out3_l = dpadd_h3(src43_l, src65_l, src87_l, filt0, filt1, filt2);
+ /* Narrow the low and high halves of each row into one byte vector. */
+ DUP4_ARG3(__lsx_vssrarni_b_h, out0_l, out0_r, VP8_FILTER_SHIFT, out1_l,
+ out1_r, VP8_FILTER_SHIFT, out2_l, out2_r, VP8_FILTER_SHIFT,
+ out3_l, out3_r, VP8_FILTER_SHIFT, tmp0, tmp1, tmp2, tmp3);
+ DUP4_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp2, 128, tmp3, 128, tmp0,
+ tmp1, tmp2, tmp3);
+ __lsx_vstx(tmp0, dst, 0);
+ __lsx_vstx(tmp1, dst, dst_stride);
+ __lsx_vstx(tmp2, dst, dst_stride_x2);
+ __lsx_vstx(tmp3, dst, dst_stride_x3);
+ dst += dst_stride_x4;
+
+ /* Slide the window down by four rows. */
+ src10_r = src54_r;
+ src32_r = src76_r;
+ src21_r = src65_r;
+ src43_r = src87_r;
+ src10_l = src54_l;
+ src32_l = src76_l;
+ src21_l = src65_l;
+ src43_l = src87_l;
+ src4 = src8;
+ }
+}
+
+/*
+ * Combined horizontal + vertical 6-tap filter for a 4-wide block, four output
+ * rows per iteration (height >> 2 iterations).
+ *
+ * src/src_stride: source pixels and row pitch; reading starts 2 bytes to the
+ *                 left of src and 2 rows above it.
+ * dst/dst_stride: destination and row pitch; 4 bytes are stored per row.
+ * filter_horiz:   horizontal tap bytes (three 16-bit pairs at offsets 0, 2, 4).
+ * filter_vert:    vertical tap bytes, same layout.
+ * height:         number of output rows.
+ *
+ * Every horiz_6tap_filt call filters two source rows at once (the 4-wide mask
+ * draws from both source vectors); `shuff` then builds the vector holding the
+ * in-between row pair from two neighbouring results. The final narrowing
+ * shift uses VP8_FILTER_SHIFT, as every other kernel in this file does,
+ * instead of a bare literal.
+ */
+static void common_hv_6ht_6vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                     uint8_t *RESTRICT dst, int32_t dst_stride,
+                                     const int8_t *filter_horiz,
+                                     const int8_t *filter_vert,
+                                     int32_t height) {
+  uint32_t loop_cnt;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, tmp0, tmp1;
+  __m128i filt_hz0, filt_hz1, filt_hz2, mask0, mask1, mask2;
+  __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+  __m128i hz_out7, filt_vt0, filt_vt1, filt_vt2, out0, out1, out2, out3;
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+
+  /* Second mask of the table: the 4-wide shuffle pattern. */
+  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16);
+  src -= 2;
+
+  /* Broadcast each pair of horizontal and vertical taps. */
+  DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filt_hz0,
+            filt_hz1);
+  filt_hz2 = __lsx_vldrepl_h(filter_horiz, 4);
+  DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0,
+            filt_vt1);
+  filt_vt2 = __lsx_vldrepl_h(filter_vert, 4);
+
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+
+  /* Prologue: rows -2 .. +2 relative to src. */
+  DUP2_ARG2(__lsx_vldx, src, -src_stride_x2, src, -src_stride, src0, src1);
+  src2 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src3, src4);
+  src += src_stride_x3;
+
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  src4 = __lsx_vxori_b(src4, 128);
+
+  hz_out0 = horiz_6tap_filt(src0, src1, mask0, mask1, mask2, filt_hz0, filt_hz1,
+                            filt_hz2);
+  hz_out2 = horiz_6tap_filt(src2, src3, mask0, mask1, mask2, filt_hz0, filt_hz1,
+                            filt_hz2);
+  hz_out1 = __lsx_vshuf_b(hz_out2, hz_out0, shuff);
+  hz_out3 = horiz_6tap_filt(src3, src4, mask0, mask1, mask2, filt_hz0, filt_hz1,
+                            filt_hz2);
+  DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, out0, out1);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    src5 = __lsx_vld(src, 0);
+    src6 = __lsx_vldx(src, src_stride);
+    src += src_stride_x2;
+
+    DUP2_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src5, src6);
+    hz_out5 = horiz_6tap_filt(src5, src6, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out4 = __lsx_vshuf_b(hz_out5, hz_out3, shuff);
+
+    src7 = __lsx_vld(src, 0);
+    src8 = __lsx_vldx(src, src_stride);
+    src += src_stride_x2;
+
+    DUP2_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src7, src8);
+    hz_out7 = horiz_6tap_filt(src7, src8, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out6 = __lsx_vshuf_b(hz_out7, hz_out5, shuff);
+
+    out2 = __lsx_vpackev_b(hz_out5, hz_out4);
+    tmp0 = dpadd_h3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
+
+    out3 = __lsx_vpackev_b(hz_out7, hz_out6);
+    tmp1 = dpadd_h3(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2);
+
+    /* Round, shift, saturate and narrow to bytes, then undo the XOR. */
+    tmp0 = __lsx_vssrarni_b_h(tmp1, tmp0, VP8_FILTER_SHIFT);
+    tmp0 = __lsx_vxori_b(tmp0, 128);
+    __lsx_vstelm_w(tmp0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_w(tmp0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_w(tmp0, dst, 0, 2);
+    dst += dst_stride;
+    __lsx_vstelm_w(tmp0, dst, 0, 3);
+    dst += dst_stride;
+
+    /* Slide the window down by four rows. */
+    hz_out3 = hz_out7;
+    out0 = out2;
+    out1 = out3;
+  }
+}
+
+/*
+ * Combined horizontal + vertical 6-tap filter for an 8-wide block, four
+ * output rows per iteration (height >> 2 iterations). Reading starts 2 bytes
+ * to the left of src and 2 rows above it. Every source row is filtered
+ * horizontally on its own (hz_outN); consecutive horizontal results are
+ * packed pairwise with __lsx_vpackev_b and each output row is the dpadd_h3 of
+ * three such pairs with the vertical taps.
+ */
+static void common_hv_6ht_6vt_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+ uint8_t *RESTRICT dst, int32_t dst_stride,
+ const int8_t *filter_horiz,
+ const int8_t *filter_vert,
+ int32_t height) {
+ uint32_t loop_cnt;
+ int32_t src_stride_x2 = src_stride << 1;
+ int32_t src_stride_x3 = src_stride_x2 + src_stride;
+ int32_t src_stride_x4 = src_stride << 2;
+ int32_t dst_stride_x2 = dst_stride << 1;
+ int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+ int32_t dst_stride_x4 = dst_stride << 2;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+ __m128i filt_hz0, filt_hz1, filt_hz2;
+ __m128i mask0, mask1, mask2, vec0, vec1;
+ __m128i filt, filt_vt0, filt_vt1, filt_vt2;
+ __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+ __m128i hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+
+ /* First mask of the table: the 8-wide shuffle pattern. */
+ mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0);
+ /* Step back 2 columns and 2 rows. */
+ src -= (2 + src_stride_x2);
+
+ filt = __lsx_vld(filter_horiz, 0);
+ DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_hz0, filt_hz1);
+ filt_hz2 = __lsx_vreplvei_h(filt, 2);
+
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+ /* Prologue: the first five rows. */
+ DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+ src_stride_x3, src0, src1, src2, src3);
+ src += src_stride_x4;
+ src4 = __lsx_vldx(src, 0);
+ src += src_stride;
+
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);
+ src4 = __lsx_vxori_b(src4, 128);
+
+ hz_out0 = horiz_6tap_filt(src0, src0, mask0, mask1, mask2, filt_hz0, filt_hz1,
+ filt_hz2);
+ hz_out1 = horiz_6tap_filt(src1, src1, mask0, mask1, mask2, filt_hz0, filt_hz1,
+ filt_hz2);
+ hz_out2 = horiz_6tap_filt(src2, src2, mask0, mask1, mask2, filt_hz0, filt_hz1,
+ filt_hz2);
+ hz_out3 = horiz_6tap_filt(src3, src3, mask0, mask1, mask2, filt_hz0, filt_hz1,
+ filt_hz2);
+ hz_out4 = horiz_6tap_filt(src4, src4, mask0, mask1, mask2, filt_hz0, filt_hz1,
+ filt_hz2);
+ filt = __lsx_vld(filter_vert, 0);
+ DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_vt0, filt_vt1);
+ filt_vt2 = __lsx_vreplvei_h(filt, 2);
+
+ /* out0/out1 pair rows (0,1),(2,3); out3/out4 pair rows (1,2),(3,4). */
+ DUP4_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, hz_out2,
+ hz_out1, hz_out4, hz_out3, out0, out1, out3, out4);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+ src_stride_x3, src5, src6, src7, src8);
+ src += src_stride_x4;
+
+ DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128, src5,
+ src6, src7, src8);
+ hz_out5 = horiz_6tap_filt(src5, src5, mask0, mask1, mask2, filt_hz0,
+ filt_hz1, filt_hz2);
+ out2 = __lsx_vpackev_b(hz_out5, hz_out4);
+ tmp0 = dpadd_h3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
+
+ hz_out6 = horiz_6tap_filt(src6, src6, mask0, mask1, mask2, filt_hz0,
+ filt_hz1, filt_hz2);
+ out5 = __lsx_vpackev_b(hz_out6, hz_out5);
+ tmp1 = dpadd_h3(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);
+
+ hz_out7 = horiz_6tap_filt(src7, src7, mask0, mask1, mask2, filt_hz0,
+ filt_hz1, filt_hz2);
+ out7 = __lsx_vpackev_b(hz_out7, hz_out6);
+ tmp2 = dpadd_h3(out1, out2, out7, filt_vt0, filt_vt1, filt_vt2);
+
+ hz_out8 = horiz_6tap_filt(src8, src8, mask0, mask1, mask2, filt_hz0,
+ filt_hz1, filt_hz2);
+ out6 = __lsx_vpackev_b(hz_out8, hz_out7);
+ tmp3 = dpadd_h3(out4, out5, out6, filt_vt0, filt_vt1, filt_vt2);
+
+ /* Round, shift, saturate and narrow to bytes, then undo the XOR with 128. */
+ DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, VP8_FILTER_SHIFT, tmp3, tmp2,
+ VP8_FILTER_SHIFT, vec0, vec1);
+ DUP2_ARG2(__lsx_vxori_b, vec0, 128, vec1, 128, vec0, vec1);
+
+ /* Each 64-bit half of vec0/vec1 is one 8-pixel output row. */
+ __lsx_vstelm_d(vec0, dst, 0, 0);
+ __lsx_vstelm_d(vec0, dst + dst_stride, 0, 1);
+ __lsx_vstelm_d(vec1, dst + dst_stride_x2, 0, 0);
+ __lsx_vstelm_d(vec1, dst + dst_stride_x3, 0, 1);
+ dst += dst_stride_x4;
+
+ /* Slide the window down by four rows. */
+ hz_out4 = hz_out8;
+ out0 = out2;
+ out1 = out7;
+ out3 = out5;
+ out4 = out6;
+ }
+}
+
+/*
+ * Combined horizontal + vertical 6-tap filter for a 16-wide block, done as
+ * two passes of the 8-wide kernel: columns 0-7 first, then columns 8-15.
+ */
+static void common_hv_6ht_6vt_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                      uint8_t *RESTRICT dst, int32_t dst_stride,
+                                      const int8_t *filter_horiz,
+                                      const int8_t *filter_vert,
+                                      int32_t height) {
+  int32_t col;
+
+  for (col = 0; col < 16; col += 8) {
+    common_hv_6ht_6vt_8w_lsx(src + col, src_stride, dst + col, dst_stride,
+                             filter_horiz, filter_vert, height);
+  }
+}
+
+/*
+ * Horizontal 4-tap filter for a 4-wide, 4-row block. Reading starts 1 byte to
+ * the left of src; two 16-bit tap pairs are read from filter at byte offsets
+ * 0 and 2. One 32-bit element (4 pixels) is stored per destination row.
+ */
+static void common_hz_4t_4x4_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+ uint8_t *RESTRICT dst, int32_t dst_stride,
+ const int8_t *filter) {
+ __m128i src0, src1, src2, src3, filt0, filt1, mask0, mask1;
+ __m128i out0, out1;
+ int32_t src_stride_x2 = src_stride << 1;
+ int32_t src_stride_x3 = src_stride_x2 + src_stride;
+
+ /* Second mask of the table: the 4-wide shuffle pattern. */
+ mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16);
+ src -= 1;
+
+ /* Broadcast each pair of taps; derive the mask for the second tap pair. */
+ DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+ mask1 = __lsx_vaddi_bu(mask0, 2);
+
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src1, src2);
+ src3 = __lsx_vldx(src, src_stride_x3);
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);
+ HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, filt1,
+ out0, out1);
+
+ /* Round, shift, saturate and narrow to bytes, then undo the XOR with 128. */
+ out0 = __lsx_vssrarni_b_h(out1, out0, VP8_FILTER_SHIFT);
+ out0 = __lsx_vxori_b(out0, 128);
+
+ __lsx_vstelm_w(out0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_w(out0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_w(out0, dst, 0, 2);
+ dst += dst_stride;
+ __lsx_vstelm_w(out0, dst, 0, 3);
+}
+
+/*
+ * Horizontal 4-tap filter for a 4-wide, 8-row block. Works on two groups of
+ * four rows: each group is loaded and filtered with
+ * HORIZ_4TAP_4WID_4VECS_FILT, then both groups are rounded/narrowed together
+ * and stored as one 32-bit element (4 pixels) per destination row. Reading
+ * starts 1 byte to the left of src.
+ */
+static void common_hz_4t_4x8_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+ uint8_t *RESTRICT dst, int32_t dst_stride,
+ const int8_t *filter) {
+ __m128i src0, src1, src2, src3, filt0, filt1, mask0, mask1;
+ __m128i out0, out1, out2, out3;
+ int32_t src_stride_x2 = src_stride << 1;
+ int32_t src_stride_x3 = src_stride_x2 + src_stride;
+ int32_t src_stride_x4 = src_stride << 2;
+
+ /* Second mask of the table: the 4-wide shuffle pattern. */
+ mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16);
+ src -= 1;
+
+ /* Broadcast each pair of taps; derive the mask for the second tap pair. */
+ DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+ mask1 = __lsx_vaddi_bu(mask0, 2);
+
+ /* Rows 0-3. */
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src1, src2);
+ src3 = __lsx_vldx(src, src_stride_x3);
+ src += src_stride_x4;
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);
+ HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, filt1,
+ out0, out1);
+
+ /* Rows 4-7. */
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src1, src2);
+ src3 = __lsx_vldx(src, src_stride_x3);
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);
+ HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, filt1,
+ out2, out3);
+ /* Round, shift, saturate and narrow to bytes, then undo the XOR with 128. */
+ DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, VP8_FILTER_SHIFT, out3, out2,
+ VP8_FILTER_SHIFT, out0, out1);
+ DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+ __lsx_vstelm_w(out0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_w(out0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_w(out0, dst, 0, 2);
+ dst += dst_stride;
+ __lsx_vstelm_w(out0, dst, 0, 3);
+ dst += dst_stride;
+
+ __lsx_vstelm_w(out1, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_w(out1, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_w(out1, dst, 0, 2);
+ dst += dst_stride;
+ __lsx_vstelm_w(out1, dst, 0, 3);
+}
+
+/* Height dispatcher for the 4-pixel-wide horizontal 4-tap filter.
+ * Heights 4 and 8 are forwarded to the matching fixed-size kernel; any other
+ * height leaves dst untouched. */
+static void common_hz_4t_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                uint8_t *RESTRICT dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height) {
+  switch (height) {
+    case 4:
+      common_hz_4t_4x4_lsx(src, src_stride, dst, dst_stride, filter);
+      break;
+    case 8:
+      common_hz_4t_4x8_lsx(src, src_stride, dst, dst_stride, filter);
+      break;
+    default:
+      /* Unsupported height: nothing is written. */
+      break;
+  }
+}
+
+/* Horizontal 4-tap filter for 8-pixel-wide blocks.
+ *
+ * src, src_stride: source pixels and row pitch in bytes; loads begin one byte
+ *                  to the left of the incoming src.
+ * dst, dst_stride: destination and row pitch; 8 bytes are stored per row.
+ * filter:          filter taps; a whole vector is loaded from this address
+ *                  but only 16-bit elements 0 and 1 are used.
+ * height:          rows to produce; four rows are written per iteration, so
+ *                  only (height >> 2) * 4 rows are produced. */
+static void common_hz_4t_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+ uint8_t *RESTRICT dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ int32_t src_stride_x2 = src_stride << 1;
+ int32_t src_stride_x3 = src_stride_x2 + src_stride;
+ int32_t src_stride_x4 = src_stride << 2;
+ int32_t dst_stride_x2 = dst_stride << 1;
+ int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+ int32_t dst_stride_x4 = dst_stride << 2;
+ __m128i src0, src1, src2, src3, filt0, filt1, mask0, mask1;
+ __m128i tmp0, tmp1;
+ __m128i filt, out0, out1, out2, out3;
+
+ /* The 8-wide kernels use the mask at the start of the mask table. */
+ mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0);
+ /* Step back one pixel so the four taps span src[-1] .. src[2]. */
+ src -= 1;
+
+ filt = __lsx_vld(filter, 0);
+ /* Broadcast the two 16-bit tap pairs across their own vectors. */
+ DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1);
+ mask1 = __lsx_vaddi_bu(mask0, 2);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+ src_stride_x3, src0, src1, src2, src3);
+ src += src_stride_x4;
+
+ /* Toggle bit 7 of every byte (presumably biasing pixels to signed range
+ * for the signed taps -- confirm against the helper macro). */
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);
+ HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
+ filt1, out0, out1, out2, out3);
+ /* Round/shift/saturate to bytes: tmp0 = rows 0|1, tmp1 = rows 2|3. */
+ DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, VP8_FILTER_SHIFT, out3, out2,
+ VP8_FILTER_SHIFT, tmp0, tmp1);
+ DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+ /* Each 64-bit element is one 8-pixel output row. */
+ __lsx_vstelm_d(tmp0, dst, 0, 0);
+ __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1);
+ __lsx_vstelm_d(tmp1, dst + dst_stride_x2, 0, 0);
+ __lsx_vstelm_d(tmp1, dst + dst_stride_x3, 0, 1);
+ dst += dst_stride_x4;
+ }
+}
+
+/* Horizontal 4-tap filter for 16-pixel-wide blocks.
+ *
+ * src, src_stride: source pixels and row pitch in bytes; loads begin one byte
+ *                  to the left of the incoming src.
+ * dst, dst_stride: destination and row pitch; a full 16-byte vector is stored
+ *                  per row.
+ * filter:          filter taps; a whole vector is loaded from this address
+ *                  but only 16-bit elements 0 and 1 are used.
+ * height:          rows to produce; four rows are written per iteration, so
+ *                  only (height >> 2) * 4 rows are produced. */
+static void common_hz_4t_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+ uint8_t *RESTRICT dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ int32_t src_stride_x2 = src_stride << 1;
+ int32_t src_stride_x3 = src_stride_x2 + src_stride;
+ int32_t src_stride_x4 = src_stride << 2;
+ int32_t dst_stride_x2 = dst_stride << 1;
+ int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+ int32_t dst_stride_x4 = dst_stride << 2;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i filt0, filt1, mask0, mask1;
+ __m128i filt, out0, out1, out2, out3, out4, out5, out6, out7;
+
+ mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0);
+ /* Step back one pixel so the four taps span src[-1] .. src[2]. */
+ src -= 1;
+
+ filt = __lsx_vld(filter, 0);
+ DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1);
+ mask1 = __lsx_vaddi_bu(mask0, 2);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ /* Even-numbered vectors hold the four rows starting at column 0 ... */
+ DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+ src_stride_x3, src0, src2, src4, src6);
+ src += 8;
+ /* ... odd-numbered vectors hold the same rows starting at column 8. */
+ DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+ src_stride_x3, src1, src3, src5, src7);
+ /* Undo the +8 column step while advancing four rows. */
+ src += src_stride_x4 - 8;
+
+ DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+ src1, src2, src3);
+ DUP4_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src6, 128, src7, 128, src4,
+ src5, src6, src7);
+ HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
+ filt1, out0, out1, out2, out3);
+ HORIZ_4TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, filt0,
+ filt1, out4, out5, out6, out7);
+ /* Narrow each (left half, right half) pair into one 16-byte row. */
+ DUP4_ARG3(__lsx_vssrarni_b_h, out1, out0, VP8_FILTER_SHIFT, out3, out2,
+ VP8_FILTER_SHIFT, out5, out4, VP8_FILTER_SHIFT, out7, out6,
+ VP8_FILTER_SHIFT, out0, out1, out2, out3);
+ DUP4_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out2, 128, out3, 128, out0,
+ out1, out2, out3);
+ __lsx_vstx(out0, dst, 0);
+ __lsx_vstx(out1, dst, dst_stride);
+ __lsx_vstx(out2, dst, dst_stride_x2);
+ __lsx_vstx(out3, dst, dst_stride_x3);
+ dst += dst_stride_x4;
+ }
+}
+
+/* Vertical 4-tap filter for 4-pixel-wide blocks.
+ *
+ * src, src_stride: source pixels and row pitch in bytes; the row above src
+ *                  is the first one read.
+ * dst, dst_stride: destination and row pitch; one 32-bit word (4 pixels) is
+ *                  stored per row.
+ * filter:          filter taps; two 16-bit tap pairs are read at byte
+ *                  offsets 0 and 2.
+ * height:          rows to produce; four rows are written per iteration, so
+ *                  only (height >> 2) * 4 rows are produced. */
+static void common_vt_4t_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+ uint8_t *RESTRICT dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ __m128i src0, src1, src2, src3, src4, src5;
+ __m128i src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
+ __m128i src2110, src4332, filt0, filt1, out0, out1;
+ int32_t src_stride_x2 = src_stride << 1;
+ int32_t src_stride_x3 = src_stride_x2 + src_stride;
+
+ DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+ /* Prime the pipeline with rows -1 (src0), 0 (src1) and +1 (src2). */
+ DUP2_ARG2(__lsx_vldx, src, -src_stride, src, src_stride, src0, src2);
+ src1 = __lsx_vld(src, 0);
+ src += src_stride_x2;
+
+ /* Byte-interleave vertically adjacent rows. */
+ DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r);
+
+ /* Low 64 bits: rows 1|0 interleaved; high 64 bits: rows 2|1 interleaved. */
+ src2110 = __lsx_vilvl_d(src21_r, src10_r);
+ src2110 = __lsx_vxori_b(src2110, 128);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ src3 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src4, src5);
+ src += src_stride_x3;
+ DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r);
+ src4332 = __lsx_vilvl_d(src43_r, src32_r);
+ src4332 = __lsx_vxori_b(src4332, 128);
+ /* First two output rows of this iteration. */
+ out0 = filt_4tap_dpadd_h(src2110, src4332, filt0, filt1);
+
+ /* src2 is reloaded with the newest row and carries into the next
+ * iteration; src2110 is likewise rebuilt for reuse. */
+ src2 = __lsx_vld(src, 0);
+ src += src_stride;
+ DUP2_ARG2(__lsx_vilvl_b, src5, src4, src2, src5, src54_r, src65_r);
+ src2110 = __lsx_vilvl_d(src65_r, src54_r);
+ src2110 = __lsx_vxori_b(src2110, 128);
+ /* Last two output rows of this iteration. */
+ out1 = filt_4tap_dpadd_h(src4332, src2110, filt0, filt1);
+ /* Round, shift, saturate and narrow to bytes, then undo the bit-7 toggle. */
+ out0 = __lsx_vssrarni_b_h(out1, out0, VP8_FILTER_SHIFT);
+ out0 = __lsx_vxori_b(out0, 128);
+
+ /* Each 32-bit element of out0 is one 4-pixel output row. */
+ __lsx_vstelm_w(out0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_w(out0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_w(out0, dst, 0, 2);
+ dst += dst_stride;
+ __lsx_vstelm_w(out0, dst, 0, 3);
+ dst += dst_stride;
+ }
+}
+
+/* Vertical 4-tap filter for 8-pixel-wide blocks.
+ *
+ * src, src_stride: source pixels and row pitch in bytes; the row above src
+ *                  is the first one read.
+ * dst, dst_stride: destination and row pitch; 8 bytes are stored per row.
+ * filter:          filter taps; a whole vector is loaded from this address
+ *                  but only 16-bit elements 0 and 1 are used.
+ * height:          rows to produce; four rows are written per iteration, so
+ *                  only (height >> 2) * 4 rows are produced. */
+static void common_vt_4t_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+ uint8_t *RESTRICT dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ int32_t src_stride_x2 = src_stride << 1;
+ int32_t src_stride_x3 = src_stride_x2 + src_stride;
+ int32_t src_stride_x4 = src_stride << 2;
+ int32_t dst_stride_x2 = dst_stride << 1;
+ int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+ int32_t dst_stride_x4 = dst_stride << 2;
+ __m128i src0, src1, src2, src7, src8, src9, src10;
+ __m128i src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1;
+ __m128i tmp0, tmp1;
+ __m128i filt, out0_r, out1_r, out2_r, out3_r;
+
+ /* Start one row above the block: the 4 taps need one row of context above. */
+ src -= src_stride;
+ filt = __lsx_vld(filter, 0);
+ DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1);
+
+ /* Prime the pipeline with the first three rows. */
+ DUP2_ARG2(__lsx_vldx, src, 0, src, src_stride, src0, src1);
+ src2 = __lsx_vldx(src, src_stride_x2);
+ src += src_stride_x3;
+
+ DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1);
+ src2 = __lsx_vxori_b(src2, 128);
+ /* Byte-interleave vertically adjacent rows (low 8 bytes of each). */
+ DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ /* Four new rows per iteration. */
+ DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+ src_stride_x3, src7, src8, src9, src10);
+ src += src_stride_x4;
+
+ DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7,
+ src8, src9, src10);
+ DUP4_ARG2(__lsx_vilvl_b, src7, src2, src8, src7, src9, src8, src10, src9,
+ src72_r, src87_r, src98_r, src109_r);
+ /* Each output row combines two interleaved row pairs. */
+ out0_r = filt_4tap_dpadd_h(src10_r, src72_r, filt0, filt1);
+ out1_r = filt_4tap_dpadd_h(src21_r, src87_r, filt0, filt1);
+ out2_r = filt_4tap_dpadd_h(src72_r, src98_r, filt0, filt1);
+ out3_r = filt_4tap_dpadd_h(src87_r, src109_r, filt0, filt1);
+ /* Round/shift/saturate to bytes: tmp0 = rows 0|1, tmp1 = rows 2|3. */
+ DUP2_ARG3(__lsx_vssrarni_b_h, out1_r, out0_r, VP8_FILTER_SHIFT, out3_r,
+ out2_r, VP8_FILTER_SHIFT, tmp0, tmp1);
+ DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+ __lsx_vstelm_d(tmp0, dst, 0, 0);
+ __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1);
+ __lsx_vstelm_d(tmp1, dst + dst_stride_x2, 0, 0);
+ __lsx_vstelm_d(tmp1, dst + dst_stride_x3, 0, 1);
+ dst += dst_stride_x4;
+
+ /* Carry the last row pairs and the newest row into the next iteration. */
+ src10_r = src98_r;
+ src21_r = src109_r;
+ src2 = src10;
+ }
+}
+
+/* Vertical 4-tap filter for 16-pixel-wide blocks.
+ *
+ * src, src_stride: source pixels and row pitch in bytes; the row above src
+ *                  is the first one read.
+ * dst, dst_stride: destination and row pitch; a full 16-byte vector is stored
+ *                  per row.
+ * filter:          filter taps; a whole vector is loaded from this address
+ *                  but only 16-bit elements 0 and 1 are used.
+ * height:          rows to produce; four rows are written per iteration, so
+ *                  only (height >> 2) * 4 rows are produced.
+ *
+ * Variables suffixed _r hold interleaves of the low 8 bytes of two rows
+ * (vilvl), those suffixed _l the high 8 bytes (vilvh). */
+static void common_vt_4t_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+ uint8_t *RESTRICT dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ int32_t src_stride_x2 = src_stride << 1;
+ int32_t src_stride_x3 = src_stride_x2 + src_stride;
+ int32_t src_stride_x4 = src_stride << 2;
+ int32_t dst_stride_x2 = dst_stride << 1;
+ int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+ int32_t dst_stride_x4 = dst_stride << 2;
+ __m128i src0, src1, src2, src3, src4, src5, src6;
+ __m128i src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l;
+ __m128i src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+ __m128i filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+
+ /* Start one row above the block: the 4 taps need one row of context above. */
+ src -= src_stride;
+ filt = __lsx_vld(filter, 0);
+ DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1);
+
+ /* Prime the pipeline with the first three rows. */
+ DUP2_ARG2(__lsx_vldx, src, 0, src, src_stride, src0, src1);
+ src2 = __lsx_vldx(src, src_stride_x2);
+ src += src_stride_x3;
+
+ DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1);
+ src2 = __lsx_vxori_b(src2, 128);
+ DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r);
+ DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ /* Four new rows per iteration. */
+ DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+ src_stride_x3, src3, src4, src5, src6);
+ src += src_stride_x4;
+
+ DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128, src3,
+ src4, src5, src6);
+ DUP4_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src5, src4, src6, src5,
+ src32_r, src43_r, src54_r, src65_r);
+ DUP4_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src5, src4, src6, src5,
+ src32_l, src43_l, src54_l, src65_l);
+ /* Low (columns 0..7) halves of the four output rows. */
+ out0_r = filt_4tap_dpadd_h(src10_r, src32_r, filt0, filt1);
+ out1_r = filt_4tap_dpadd_h(src21_r, src43_r, filt0, filt1);
+ out2_r = filt_4tap_dpadd_h(src32_r, src54_r, filt0, filt1);
+ out3_r = filt_4tap_dpadd_h(src43_r, src65_r, filt0, filt1);
+ /* High (columns 8..15) halves of the four output rows. */
+ out0_l = filt_4tap_dpadd_h(src10_l, src32_l, filt0, filt1);
+ out1_l = filt_4tap_dpadd_h(src21_l, src43_l, filt0, filt1);
+ out2_l = filt_4tap_dpadd_h(src32_l, src54_l, filt0, filt1);
+ out3_l = filt_4tap_dpadd_h(src43_l, src65_l, filt0, filt1);
+ /* Narrow each (high half, low half) pair into one 16-byte row. */
+ DUP4_ARG3(__lsx_vssrarni_b_h, out0_l, out0_r, VP8_FILTER_SHIFT, out1_l,
+ out1_r, VP8_FILTER_SHIFT, out2_l, out2_r, VP8_FILTER_SHIFT,
+ out3_l, out3_r, VP8_FILTER_SHIFT, tmp0, tmp1, tmp2, tmp3);
+ DUP4_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp2, 128, tmp3, 128, tmp0,
+ tmp1, tmp2, tmp3);
+ __lsx_vstx(tmp0, dst, 0);
+ __lsx_vstx(tmp1, dst, dst_stride);
+ __lsx_vstx(tmp2, dst, dst_stride_x2);
+ __lsx_vstx(tmp3, dst, dst_stride_x3);
+ dst += dst_stride_x4;
+
+ /* Carry the last row pairs and the newest row into the next iteration. */
+ src10_r = src54_r;
+ src21_r = src65_r;
+ src10_l = src54_l;
+ src21_l = src65_l;
+ src2 = src6;
+ }
+}
+
+/* 2-D filter (horizontal 4-tap followed by vertical 4-tap) for 4-pixel-wide
+ * blocks.
+ *
+ * src, src_stride: source pixels and row pitch in bytes; reading starts one
+ *                  byte left of src, on the row above it.
+ * dst, dst_stride: destination and row pitch; one 32-bit word (4 pixels) is
+ *                  stored per row.
+ * filter_horiz:    horizontal taps; two 16-bit tap pairs at byte offsets 0, 2.
+ * filter_vert:     vertical taps, same layout.
+ * height:          rows to produce; four rows are written per iteration, so
+ *                  only (height >> 2) * 4 rows are produced.
+ */
+static void common_hv_4ht_4vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                     uint8_t *RESTRICT dst, int32_t dst_stride,
+                                     const int8_t *filter_horiz,
+                                     const int8_t *filter_vert,
+                                     int32_t height) {
+  uint32_t loop_cnt;
+  __m128i src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1;
+  __m128i mask0, mask1, filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2;
+  __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5;
+  /* Byte indices 8..15 followed by 16..23: joins the upper half of one
+   * vshuf source with the lower half of the other. */
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+
+  /* The 4-wide kernels use the mask stored 16 bytes into the mask table. */
+  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16);
+  /* Step back one pixel so the four horizontal taps span src[-1] .. src[2]. */
+  src -= 1;
+
+  DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filt_hz0,
+            filt_hz1);
+  mask1 = __lsx_vaddi_bu(mask0, 2);
+
+  /* Prime the pipeline with rows -1 (src0), 0 (src1) and +1 (src2). */
+  src1 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, -src_stride, src, src_stride, src0, src2);
+  src += src_stride_x2;
+
+  /* Toggle bit 7 of every byte before the signed filter arithmetic. */
+  DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1);
+  src2 = __lsx_vxori_b(src2, 128);
+  hz_out0 = horiz_4tap_filt(src0, src1, mask0, mask1, filt_hz0, filt_hz1);
+  hz_out1 = horiz_4tap_filt(src1, src2, mask0, mask1, filt_hz0, filt_hz1);
+  vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+
+  DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0,
+            filt_vt1);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    /* Four new source rows per iteration. */
+    src3 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src4, src5);
+    src6 = __lsx_vldx(src, src_stride_x3);
+    src += src_stride_x4;
+
+    DUP2_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src3, src4);
+    hz_out3 = horiz_4tap_filt(src3, src4, mask0, mask1, filt_hz0, filt_hz1);
+    /* Stitch the row pair that straddles the previous and current results. */
+    hz_out2 = __lsx_vshuf_b(hz_out3, hz_out1, shuff);
+    vec1 = __lsx_vpackev_b(hz_out3, hz_out2);
+    tmp0 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1);
+
+    DUP2_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src5, src6);
+    hz_out5 = horiz_4tap_filt(src5, src6, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out4 = __lsx_vshuf_b(hz_out5, hz_out3, shuff);
+    vec2 = __lsx_vpackev_b(hz_out5, hz_out4);
+    tmp1 = filt_4tap_dpadd_h(vec1, vec2, filt_vt0, filt_vt1);
+
+    /* Round, shift, saturate and narrow to bytes, then undo the bit-7 toggle.
+     * Uses the named shift like the 1-D kernels instead of a literal. */
+    tmp0 = __lsx_vssrarni_b_h(tmp1, tmp0, VP8_FILTER_SHIFT);
+    tmp0 = __lsx_vxori_b(tmp0, 128);
+    /* Each 32-bit element of tmp0 is one 4-pixel output row. */
+    __lsx_vstelm_w(tmp0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_w(tmp0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_w(tmp0, dst, 0, 2);
+    dst += dst_stride;
+    __lsx_vstelm_w(tmp0, dst, 0, 3);
+    dst += dst_stride;
+
+    /* Carry the newest horizontal results into the next iteration. */
+    hz_out1 = hz_out5;
+    vec0 = vec2;
+  }
+}
+
+/* 2-D filter (horizontal 4-tap followed by vertical 4-tap) for 8-pixel-wide
+ * blocks.
+ *
+ * src, src_stride: source pixels and row pitch in bytes; reading starts one
+ *                  byte left of src, on the row above it.
+ * dst, dst_stride: destination and row pitch; 8 bytes are stored per row.
+ * filter_horiz:    horizontal taps; a whole vector is loaded but only 16-bit
+ *                  elements 0 and 1 are used.
+ * filter_vert:     vertical taps, same layout.
+ * height:          rows to produce; four rows are written per iteration, so
+ *                  only (height >> 2) * 4 rows are produced.
+ */
+static inline void common_hv_4ht_4vt_8w_lsx(
+    uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst,
+    int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert,
+    int32_t height) {
+  uint32_t loop_cnt;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+  int32_t dst_stride_x2 = dst_stride << 1;
+  int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+  int32_t dst_stride_x4 = dst_stride << 2;
+  __m128i src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1;
+  __m128i mask0, mask1, out0, out1;
+  __m128i filt, filt_vt0, filt_vt1, tmp0, tmp1, tmp2, tmp3;
+  __m128i hz_out0, hz_out1, hz_out2, hz_out3;
+  __m128i vec0, vec1, vec2, vec3, vec4;
+
+  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0);
+  /* One pixel left and one row up: context needed by the 4-tap filters. */
+  src -= 1 + src_stride;
+
+  filt = __lsx_vld(filter_horiz, 0);
+  DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_hz0, filt_hz1);
+  mask1 = __lsx_vaddi_bu(mask0, 2);
+
+  /* Prime the pipeline with the first three rows. */
+  DUP2_ARG2(__lsx_vldx, src, 0, src, src_stride, src0, src1);
+  src2 = __lsx_vldx(src, src_stride_x2);
+  src += src_stride_x3;
+
+  /* Toggle bit 7 of every byte before the signed filter arithmetic. */
+  DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1);
+  src2 = __lsx_vxori_b(src2, 128);
+  hz_out0 = horiz_4tap_filt(src0, src0, mask0, mask1, filt_hz0, filt_hz1);
+  hz_out1 = horiz_4tap_filt(src1, src1, mask0, mask1, filt_hz0, filt_hz1);
+  hz_out2 = horiz_4tap_filt(src2, src2, mask0, mask1, filt_hz0, filt_hz1);
+  /* vec0 pairs rows (0,1) of the primed set, vec2 pairs rows (1,2). */
+  DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out2, hz_out1, vec0, vec2);
+
+  filt = __lsx_vld(filter_vert, 0);
+  DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_vt0, filt_vt1);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    /* Four new source rows per iteration. */
+    DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+              src_stride_x3, src3, src4, src5, src6);
+    src += src_stride_x4;
+
+    DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128, src3,
+              src4, src5, src6);
+    hz_out3 = horiz_4tap_filt(src3, src3, mask0, mask1, filt_hz0, filt_hz1);
+    vec1 = __lsx_vpackev_b(hz_out3, hz_out2);
+    tmp0 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1);
+
+    hz_out0 = horiz_4tap_filt(src4, src4, mask0, mask1, filt_hz0, filt_hz1);
+    vec3 = __lsx_vpackev_b(hz_out0, hz_out3);
+    tmp1 = filt_4tap_dpadd_h(vec2, vec3, filt_vt0, filt_vt1);
+
+    hz_out1 = horiz_4tap_filt(src5, src5, mask0, mask1, filt_hz0, filt_hz1);
+    vec4 = __lsx_vpackev_b(hz_out1, hz_out0);
+    tmp2 = filt_4tap_dpadd_h(vec1, vec4, filt_vt0, filt_vt1);
+
+    hz_out2 = horiz_4tap_filt(src6, src6, mask0, mask1, filt_hz0, filt_hz1);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out3, hz_out2, hz_out1, vec0, vec1);
+    tmp3 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1);
+
+    /* Round, shift, saturate and narrow to bytes: out0 = rows 0|1,
+     * out1 = rows 2|3. Uses the named shift like the 1-D kernels. */
+    DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, VP8_FILTER_SHIFT, tmp3, tmp2,
+              VP8_FILTER_SHIFT, out0, out1);
+    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+    __lsx_vstelm_d(out0, dst, 0, 0);
+    __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
+    __lsx_vstelm_d(out1, dst + dst_stride_x2, 0, 0);
+    __lsx_vstelm_d(out1, dst + dst_stride_x3, 0, 1);
+    dst += dst_stride_x4;
+
+    /* Carry the two newest row pairs into the next iteration (hz_out2
+     * already holds the newest row). */
+    vec0 = vec4;
+    vec2 = vec1;
+  }
+}
+
+/* 16-pixel-wide 4-tap x 4-tap 2-D filter, built from two passes of the
+ * 8-pixel-wide kernel: columns 0..7 first, then columns 8..15. */
+static void common_hv_4ht_4vt_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                      uint8_t *RESTRICT dst, int32_t dst_stride,
+                                      const int8_t *filter_horiz,
+                                      const int8_t *filter_vert,
+                                      int32_t height) {
+  int32_t col;
+
+  for (col = 0; col < 16; col += 8) {
+    common_hv_4ht_4vt_8w_lsx(src + col, src_stride, dst + col, dst_stride,
+                             filter_horiz, filter_vert, height);
+  }
+}
+
+/* 2-D filter (horizontal 6-tap followed by vertical 4-tap) for 4-pixel-wide
+ * blocks.
+ *
+ * src, src_stride: source pixels and row pitch in bytes; reading starts two
+ *                  bytes left of src, on the row above it.
+ * dst, dst_stride: destination and row pitch; one 32-bit word (4 pixels) is
+ *                  stored per row.
+ * filter_horiz:    horizontal taps; three 16-bit tap pairs at byte offsets
+ *                  0, 2 and 4.
+ * filter_vert:     vertical taps; two 16-bit tap pairs at byte offsets 0, 2.
+ * height:          rows to produce; four rows are written per iteration, so
+ *                  only (height >> 2) * 4 rows are produced.
+ */
+static void common_hv_6ht_4vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                     uint8_t *RESTRICT dst, int32_t dst_stride,
+                                     const int8_t *filter_horiz,
+                                     const int8_t *filter_vert,
+                                     int32_t height) {
+  uint32_t loop_cnt;
+  __m128i src0, src1, src2, src3, src4, src5, src6;
+  __m128i filt_hz0, filt_hz1, filt_hz2, mask0, mask1, mask2;
+  __m128i filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2;
+  __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5;
+  /* Byte indices 8..15 followed by 16..23: joins the upper half of one
+   * vshuf source with the lower half of the other. */
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+
+  /* The 4-wide kernels use the mask stored 16 bytes into the mask table. */
+  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16);
+  /* Step back two pixels so the six horizontal taps span src[-2] .. src[3]. */
+  src -= 2;
+
+  DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filt_hz0,
+            filt_hz1);
+  filt_hz2 = __lsx_vldrepl_h(filter_horiz, 4);
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+
+  /* Prime the pipeline with rows -1 (src0), 0 (src1) and +1 (src2). */
+  src1 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, -src_stride, src, src_stride, src0, src2);
+  src += src_stride_x2;
+
+  /* Toggle bit 7 of every byte before the signed filter arithmetic. */
+  DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1);
+  src2 = __lsx_vxori_b(src2, 128);
+
+  hz_out0 = horiz_6tap_filt(src0, src1, mask0, mask1, mask2, filt_hz0, filt_hz1,
+                            filt_hz2);
+  hz_out1 = horiz_6tap_filt(src1, src2, mask0, mask1, mask2, filt_hz0, filt_hz1,
+                            filt_hz2);
+  vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+
+  DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0,
+            filt_vt1);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    /* Four new source rows per iteration. */
+    src3 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src4, src5);
+    src6 = __lsx_vldx(src, src_stride_x3);
+    src += src_stride_x4;
+    DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128, src3,
+              src4, src5, src6);
+
+    hz_out3 = horiz_6tap_filt(src3, src4, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    /* Stitch the row pair that straddles the previous and current results. */
+    hz_out2 = __lsx_vshuf_b(hz_out3, hz_out1, shuff);
+    vec1 = __lsx_vpackev_b(hz_out3, hz_out2);
+    tmp0 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1);
+
+    hz_out5 = horiz_6tap_filt(src5, src6, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out4 = __lsx_vshuf_b(hz_out5, hz_out3, shuff);
+    vec2 = __lsx_vpackev_b(hz_out5, hz_out4);
+    tmp1 = filt_4tap_dpadd_h(vec1, vec2, filt_vt0, filt_vt1);
+
+    /* Each accumulator is narrowed against itself, so only 32-bit elements
+     * 0 and 1 of tmp0/tmp1 are stored below. Uses the named shift like the
+     * 1-D kernels instead of a literal. */
+    DUP2_ARG3(__lsx_vssrarni_b_h, tmp0, tmp0, VP8_FILTER_SHIFT, tmp1, tmp1,
+              VP8_FILTER_SHIFT, tmp0, tmp1);
+    DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+
+    __lsx_vstelm_w(tmp0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_w(tmp0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_w(tmp1, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_w(tmp1, dst, 0, 1);
+    dst += dst_stride;
+
+    /* Carry the newest horizontal results into the next iteration. */
+    hz_out1 = hz_out5;
+    vec0 = vec2;
+  }
+}
+
+/* 2-D filter (horizontal 6-tap followed by vertical 4-tap) for 8-pixel-wide
+ * blocks.
+ *
+ * src, src_stride: source pixels and row pitch in bytes; reading starts two
+ *                  bytes left of src, on the row above it.
+ * dst, dst_stride: destination and row pitch; 8 bytes are stored per row.
+ * filter_horiz:    horizontal taps; a whole vector is loaded but only 16-bit
+ *                  elements 0, 1 and 2 are used.
+ * filter_vert:     vertical taps; only 16-bit elements 0 and 1 are used.
+ * height:          rows to produce; four rows are written per iteration, so
+ *                  only (height >> 2) * 4 rows are produced.
+ */
+static inline void common_hv_6ht_4vt_8w_lsx(
+    uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst,
+    int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert,
+    int32_t height) {
+  uint32_t loop_cnt;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+  int32_t dst_stride_x2 = dst_stride << 1;
+  int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+  int32_t dst_stride_x4 = dst_stride << 2;
+
+  __m128i src0, src1, src2, src3, src4, src5, src6;
+  __m128i filt_hz0, filt_hz1, filt_hz2, mask0, mask1, mask2;
+  __m128i filt, filt_vt0, filt_vt1, hz_out0, hz_out1, hz_out2, hz_out3;
+  __m128i tmp0, tmp1, tmp2, tmp3, vec0, vec1, vec2, vec3;
+  __m128i out0, out1;
+
+  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0);
+  /* Two pixels left (6-tap horizontal) and one row up (4-tap vertical). */
+  src -= (2 + src_stride);
+
+  filt = __lsx_vld(filter_horiz, 0);
+  DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_hz0, filt_hz1);
+  filt_hz2 = __lsx_vreplvei_h(filt, 2);
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+
+  /* Prime the pipeline with the first three rows. */
+  DUP2_ARG2(__lsx_vldx, src, 0, src, src_stride, src0, src1);
+  src2 = __lsx_vldx(src, src_stride_x2);
+  src += src_stride_x3;
+
+  /* Toggle bit 7 of every byte before the signed filter arithmetic. */
+  DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1);
+  src2 = __lsx_vxori_b(src2, 128);
+  hz_out0 = horiz_6tap_filt(src0, src0, mask0, mask1, mask2, filt_hz0, filt_hz1,
+                            filt_hz2);
+  hz_out1 = horiz_6tap_filt(src1, src1, mask0, mask1, mask2, filt_hz0, filt_hz1,
+                            filt_hz2);
+  hz_out2 = horiz_6tap_filt(src2, src2, mask0, mask1, mask2, filt_hz0, filt_hz1,
+                            filt_hz2);
+  /* vec0 pairs rows (0,1) of the primed set, vec2 pairs rows (1,2). */
+  DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out2, hz_out1, vec0, vec2);
+
+  filt = __lsx_vld(filter_vert, 0);
+  DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_vt0, filt_vt1);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    /* Four new source rows per iteration. */
+    DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+              src_stride_x3, src3, src4, src5, src6);
+    src += src_stride_x4;
+    DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128, src3,
+              src4, src5, src6);
+
+    hz_out3 = horiz_6tap_filt(src3, src3, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    vec1 = __lsx_vpackev_b(hz_out3, hz_out2);
+    tmp0 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1);
+
+    hz_out0 = horiz_6tap_filt(src4, src4, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    vec3 = __lsx_vpackev_b(hz_out0, hz_out3);
+    tmp1 = filt_4tap_dpadd_h(vec2, vec3, filt_vt0, filt_vt1);
+
+    /* vec0 is overwritten here and doubles as the carry into the next
+     * iteration. */
+    hz_out1 = horiz_6tap_filt(src5, src5, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+    tmp2 = filt_4tap_dpadd_h(vec1, vec0, filt_vt0, filt_vt1);
+
+    /* Likewise vec2 (newest row pair) carries over; hz_out2 holds the newest
+     * row. */
+    hz_out2 = horiz_6tap_filt(src6, src6, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out3, hz_out2, hz_out1, vec1, vec2);
+    tmp3 = filt_4tap_dpadd_h(vec1, vec2, filt_vt0, filt_vt1);
+
+    /* Round, shift, saturate and narrow to bytes: out0 = rows 0|1,
+     * out1 = rows 2|3. Uses the named shift like the 1-D kernels. */
+    DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, VP8_FILTER_SHIFT, tmp3, tmp2,
+              VP8_FILTER_SHIFT, out0, out1);
+    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+    __lsx_vstelm_d(out0, dst, 0, 0);
+    __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
+    __lsx_vstelm_d(out1, dst + dst_stride_x2, 0, 0);
+    __lsx_vstelm_d(out1, dst + dst_stride_x3, 0, 1);
+    dst += dst_stride_x4;
+  }
+}
+
+/* 16-pixel-wide 6-tap x 4-tap 2-D filter, built from two passes of the
+ * 8-pixel-wide kernel: columns 0..7 first, then columns 8..15. */
+static void common_hv_6ht_4vt_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                      uint8_t *RESTRICT dst, int32_t dst_stride,
+                                      const int8_t *filter_horiz,
+                                      const int8_t *filter_vert,
+                                      int32_t height) {
+  int32_t col;
+
+  for (col = 0; col < 16; col += 8) {
+    common_hv_6ht_4vt_8w_lsx(src + col, src_stride, dst + col, dst_stride,
+                             filter_horiz, filter_vert, height);
+  }
+}
+
+/* 2-D filter (horizontal 4-tap followed by vertical 6-tap) for 4-pixel-wide
+ * blocks.
+ *
+ * src, src_stride: source pixels and row pitch in bytes; reading starts one
+ *                  byte left of src, two rows above it.
+ * dst, dst_stride: destination and row pitch; one 32-bit word (4 pixels) is
+ *                  stored per row.
+ * filter_horiz:    horizontal taps; two 16-bit tap pairs at byte offsets 0, 2.
+ * filter_vert:     vertical taps; three 16-bit tap pairs at byte offsets
+ *                  0, 2 and 4.
+ * height:          rows to produce; four rows are written per iteration, so
+ *                  only (height >> 2) * 4 rows are produced.
+ */
+static void common_hv_4ht_6vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                     uint8_t *RESTRICT dst, int32_t dst_stride,
+                                     const int8_t *filter_horiz,
+                                     const int8_t *filter_vert,
+                                     int32_t height) {
+  uint32_t loop_cnt;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+  __m128i filt_hz0, filt_hz1, filt_vt0, filt_vt1, filt_vt2, mask0, mask1;
+  __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+  __m128i hz_out7, tmp0, tmp1, out0, out1, out2, out3;
+  /* Byte indices 8..15 followed by 16..23: joins the upper half of one
+   * vshuf source with the lower half of the other. */
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+
+  /* The 4-wide kernels use the mask stored 16 bytes into the mask table. */
+  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16);
+
+  /* Step back one pixel so the four horizontal taps span src[-1] .. src[2]. */
+  src -= 1;
+
+  DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filt_hz0,
+            filt_hz1);
+  mask1 = __lsx_vaddi_bu(mask0, 2);
+
+  /* Prime the pipeline with rows -2, -1, +1, +2 (src0, src1, src3, src4) and
+   * row 0 (src2). */
+  DUP4_ARG2(__lsx_vldx, src, -src_stride_x2, src, -src_stride, src, src_stride,
+            src, src_stride_x2, src0, src1, src3, src4);
+  src2 = __lsx_vld(src, 0);
+  src += src_stride_x3;
+
+  /* Toggle bit 7 of every byte before the signed filter arithmetic. */
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  src4 = __lsx_vxori_b(src4, 128);
+  hz_out0 = horiz_4tap_filt(src0, src1, mask0, mask1, filt_hz0, filt_hz1);
+  hz_out2 = horiz_4tap_filt(src2, src3, mask0, mask1, filt_hz0, filt_hz1);
+  hz_out3 = horiz_4tap_filt(src3, src4, mask0, mask1, filt_hz0, filt_hz1);
+  /* hz_out1 is stitched from the halves of its two neighbours. */
+  hz_out1 = __lsx_vshuf_b(hz_out2, hz_out0, shuff);
+  DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, out0, out1);
+
+  DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0,
+            filt_vt1);
+  filt_vt2 = __lsx_vldrepl_h(filter_vert, 4);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    /* Four new source rows per iteration. */
+    src5 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src6, src7);
+    src8 = __lsx_vldx(src, src_stride_x3);
+    DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128, src5,
+              src6, src7, src8);
+    src += src_stride_x4;
+
+    hz_out5 = horiz_4tap_filt(src5, src6, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out4 = __lsx_vshuf_b(hz_out5, hz_out3, shuff);
+    out2 = __lsx_vpackev_b(hz_out5, hz_out4);
+    tmp0 = dpadd_h3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
+
+    hz_out7 = horiz_4tap_filt(src7, src8, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out6 = __lsx_vshuf_b(hz_out7, hz_out5, shuff);
+    out3 = __lsx_vpackev_b(hz_out7, hz_out6);
+    tmp1 = dpadd_h3(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2);
+
+    /* Round, shift, saturate and narrow to bytes, then undo the bit-7 toggle.
+     * Uses the named shift like the 1-D kernels instead of a literal. */
+    tmp0 = __lsx_vssrarni_b_h(tmp1, tmp0, VP8_FILTER_SHIFT);
+    tmp0 = __lsx_vxori_b(tmp0, 128);
+    /* Each 32-bit element of tmp0 is one 4-pixel output row. */
+    __lsx_vstelm_w(tmp0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_w(tmp0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_w(tmp0, dst, 0, 2);
+    dst += dst_stride;
+    __lsx_vstelm_w(tmp0, dst, 0, 3);
+    dst += dst_stride;
+
+    /* Carry the newest horizontal results into the next iteration. */
+    hz_out3 = hz_out7;
+    out0 = out2;
+    out1 = out3;
+  }
+}
+
+/* 2-D filter (horizontal 4-tap followed by vertical 6-tap) for 8-pixel-wide
+ * blocks.
+ *
+ * src, src_stride: source pixels and row pitch in bytes; reading starts one
+ *                  byte left of src, two rows above it.
+ * dst, dst_stride: destination and row pitch; 8 bytes are stored per row.
+ * filter_horiz:    horizontal taps; a whole vector is loaded but only 16-bit
+ *                  elements 0 and 1 are used.
+ * filter_vert:     vertical taps; only 16-bit elements 0, 1 and 2 are used.
+ * height:          rows to produce; four rows are written per iteration, so
+ *                  only (height >> 2) * 4 rows are produced.
+ */
+static inline void common_hv_4ht_6vt_8w_lsx(
+    uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst,
+    int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert,
+    int32_t height) {
+  uint32_t loop_cnt;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+  int32_t dst_stride_x2 = dst_stride << 1;
+  int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+  int32_t dst_stride_x4 = dst_stride << 2;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+  __m128i filt_hz0, filt_hz1, mask0, mask1;
+  __m128i filt, filt_vt0, filt_vt1, filt_vt2, tmp0, tmp1, tmp2, tmp3;
+  __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+  __m128i hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7;
+  __m128i vec0, vec1;
+
+  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0);
+  /* One pixel left (4-tap horizontal) and two rows up (6-tap vertical). */
+  src -= 1 + src_stride_x2;
+
+  filt = __lsx_vld(filter_horiz, 0);
+  DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_hz0, filt_hz1);
+  mask1 = __lsx_vaddi_bu(mask0, 2);
+
+  /* Prime the pipeline with the first five rows. */
+  DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+            src_stride_x3, src0, src1, src2, src3);
+  src += src_stride_x4;
+  src4 = __lsx_vld(src, 0);
+  src += src_stride;
+
+  /* Toggle bit 7 of every byte before the signed filter arithmetic. */
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  src4 = __lsx_vxori_b(src4, 128);
+  hz_out0 = horiz_4tap_filt(src0, src0, mask0, mask1, filt_hz0, filt_hz1);
+  hz_out1 = horiz_4tap_filt(src1, src1, mask0, mask1, filt_hz0, filt_hz1);
+  hz_out2 = horiz_4tap_filt(src2, src2, mask0, mask1, filt_hz0, filt_hz1);
+  hz_out3 = horiz_4tap_filt(src3, src3, mask0, mask1, filt_hz0, filt_hz1);
+  hz_out4 = horiz_4tap_filt(src4, src4, mask0, mask1, filt_hz0, filt_hz1);
+  /* out0/out1 pair primed rows (0,1) and (2,3); out3/out4 pair the rows one
+   * step later, (1,2) and (3,4). */
+  DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, out0, out1);
+  DUP2_ARG2(__lsx_vpackev_b, hz_out2, hz_out1, hz_out4, hz_out3, out3, out4);
+
+  filt = __lsx_vld(filter_vert, 0);
+  DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_vt0, filt_vt1);
+  filt_vt2 = __lsx_vreplvei_h(filt, 2);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    /* Four new source rows per iteration. */
+    DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+              src_stride_x3, src5, src6, src7, src8);
+    src += src_stride_x4;
+
+    DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128, src5,
+              src6, src7, src8);
+    hz_out5 = horiz_4tap_filt(src5, src5, mask0, mask1, filt_hz0, filt_hz1);
+    out2 = __lsx_vpackev_b(hz_out5, hz_out4);
+    tmp0 = dpadd_h3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
+
+    hz_out6 = horiz_4tap_filt(src6, src6, mask0, mask1, filt_hz0, filt_hz1);
+    out5 = __lsx_vpackev_b(hz_out6, hz_out5);
+    tmp1 = dpadd_h3(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);
+
+    hz_out7 = horiz_4tap_filt(src7, src7, mask0, mask1, filt_hz0, filt_hz1);
+    out6 = __lsx_vpackev_b(hz_out7, hz_out6);
+    tmp2 = dpadd_h3(out1, out2, out6, filt_vt0, filt_vt1, filt_vt2);
+
+    hz_out8 = horiz_4tap_filt(src8, src8, mask0, mask1, filt_hz0, filt_hz1);
+    out7 = __lsx_vpackev_b(hz_out8, hz_out7);
+    tmp3 = dpadd_h3(out4, out5, out7, filt_vt0, filt_vt1, filt_vt2);
+    /* Round, shift, saturate and narrow to bytes: vec0 = rows 0|1,
+     * vec1 = rows 2|3. Uses the named shift like the 1-D kernels. */
+    DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, VP8_FILTER_SHIFT, tmp3, tmp2,
+              VP8_FILTER_SHIFT, vec0, vec1);
+    DUP2_ARG2(__lsx_vxori_b, vec0, 128, vec1, 128, vec0, vec1);
+    __lsx_vstelm_d(vec0, dst, 0, 0);
+    __lsx_vstelm_d(vec0, dst + dst_stride, 0, 1);
+    __lsx_vstelm_d(vec1, dst + dst_stride_x2, 0, 0);
+    __lsx_vstelm_d(vec1, dst + dst_stride_x3, 0, 1);
+    dst += dst_stride_x4;
+
+    /* Slide the window down four rows for the next iteration. */
+    hz_out4 = hz_out8;
+    out0 = out2;
+    out1 = out6;
+    out3 = out5;
+    out4 = out7;
+  }
+}
+
+/* 16-pixel-wide 4-tap x 6-tap 2-D filter, built from two passes of the
+ * 8-pixel-wide kernel: columns 0..7 first, then columns 8..15. */
+static void common_hv_4ht_6vt_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                      uint8_t *RESTRICT dst, int32_t dst_stride,
+                                      const int8_t *filter_horiz,
+                                      const int8_t *filter_vert,
+                                      int32_t height) {
+  int32_t col;
+
+  for (col = 0; col < 16; col += 8) {
+    common_hv_4ht_6vt_8w_lsx(src + col, src_stride, dst + col, dst_stride,
+                             filter_horiz, filter_vert, height);
+  }
+}
+
+/* Signature shared by the 2-D (horizontal + vertical) kernels, which take
+ * one filter per direction. */
+typedef void (*PVp8SixtapPredictFunc1)(
+ uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst,
+ int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert,
+ int32_t height);
+
+/* Signature shared by the 1-D (horizontal-only or vertical-only) kernels,
+ * which take a single filter. */
+typedef void (*PVp8SixtapPredictFunc2)(uint8_t *RESTRICT src,
+ int32_t src_stride,
+ uint8_t *RESTRICT dst,
+ int32_t dst_stride, const int8_t *filter,
+ int32_t height);
+
+/* VP8 sub-pixel prediction of a 4x4 block (LSX).
+ *
+ * src, src_stride: source pixels and row pitch in bytes.
+ * xoffset/yoffset: horizontal/vertical sub-pixel position. Valid values are
+ *                  0..7; for anything else the function returns without
+ *                  writing dst. 0 means no filtering in that direction.
+ * dst, dst_stride: destination and row pitch; 4 bytes are written per row.
+ *
+ * Kernel selection: an even non-zero offset uses the 6-tap kernel for that
+ * direction; an odd offset uses the 4-tap kernel and passes the filter
+ * pointer advanced by one byte (presumably to skip an unused leading tap --
+ * confirm against the layout of vp8_subpel_filters_lsx).
+ */
+void vp8_sixtap_predict4x4_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                               int32_t xoffset, int32_t yoffset,
+                               uint8_t *RESTRICT dst, int32_t dst_stride) {
+  /* 2-D kernels, indexed by 2 * (xoffset is odd) + (yoffset is odd). */
+  static const PVp8SixtapPredictFunc1 Predict4x4Funcs1[4] = {
+    common_hv_6ht_6vt_4w_lsx,
+    common_hv_6ht_4vt_4w_lsx,
+    common_hv_4ht_6vt_4w_lsx,
+    common_hv_4ht_4vt_4w_lsx,
+  };
+
+  /* 1-D kernels: [0..1] vertical 6-tap/4-tap, [2..3] horizontal 6-tap/4-tap. */
+  static const PVp8SixtapPredictFunc2 Predict4x4Funcs2[4] = {
+    common_vt_6t_4w_lsx,
+    common_vt_4t_4w_lsx,
+    common_hz_6t_4w_lsx,
+    common_hz_4t_4w_lsx,
+  };
+
+  const int32_t x_odd = xoffset & 1;
+  const int32_t y_odd = yoffset & 1;
+
+  /* Reject out-of-range offsets before any filter-table lookup. */
+  if (xoffset < 0 || xoffset >= 8 || yoffset < 0 || yoffset >= 8) return;
+
+  if (xoffset && yoffset) {
+    /* Both directions filtered. The table is indexed with (offset - 1), so
+     * the lookup is only done for non-zero offsets. */
+    const int8_t *h_filter = vp8_subpel_filters_lsx[xoffset - 1];
+    const int8_t *v_filter = vp8_subpel_filters_lsx[yoffset - 1];
+
+    Predict4x4Funcs1[(x_odd << 1) | y_odd](src, src_stride, dst, dst_stride,
+                                           h_filter + x_odd, v_filter + y_odd,
+                                           4);
+  } else if (yoffset) {
+    /* Vertical filtering only. */
+    const int8_t *v_filter = vp8_subpel_filters_lsx[yoffset - 1];
+
+    Predict4x4Funcs2[y_odd](src, src_stride, dst, dst_stride, v_filter + y_odd,
+                            4);
+  } else if (xoffset) {
+    /* Horizontal filtering only. */
+    const int8_t *h_filter = vp8_subpel_filters_lsx[xoffset - 1];
+
+    Predict4x4Funcs2[2 + x_odd](src, src_stride, dst, dst_stride,
+                                h_filter + x_odd, 4);
+  } else {
+    /* Full-pixel position: plain 4x4 copy. Load the four pixels of each row
+     * as one 32-bit word and store that word back out. (The previous code
+     * inserted the pointer value itself into an uninitialised vector and
+     * never read the source pixels.) */
+    int32_t row;
+
+    for (row = 0; row < 4; ++row) {
+      __m128i tp0 = __lsx_vldrepl_w(src + row * src_stride, 0);
+      __lsx_vstelm_w(tp0, dst + row * dst_stride, 0, 0);
+    }
+  }
+}
+
+/* Sub-pixel prediction of an 8x8 block.
+ *
+ * xoffset / yoffset select the horizontal / vertical filter row
+ * (row `offset - 1` of vp8_subpel_filters_lsx). A zero offset means no
+ * filtering in that direction; with both offsets zero the block is copied.
+ * A non-zero even offset dispatches to a 6-tap kernel; an odd offset
+ * dispatches to a 4-tap kernel, which receives the filter row advanced by
+ * one coefficient. Offsets outside 0..7 leave dst untouched.
+ */
+void vp8_sixtap_predict8x8_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                               int32_t xoffset, int32_t yoffset,
+                               uint8_t *RESTRICT dst, int32_t dst_stride) {
+  /* Indexed by ((xoffset & 1) << 1) | (yoffset & 1). */
+  static PVp8SixtapPredictFunc1 Predict8x8Funcs1[4] = {
+    common_hv_6ht_6vt_8w_lsx,
+    common_hv_6ht_4vt_8w_lsx,
+    common_hv_4ht_6vt_8w_lsx,
+    common_hv_4ht_4vt_8w_lsx,
+  };
+  /* [0], [1]: vertical only (6-tap, 4-tap); [2], [3]: horizontal only. */
+  static PVp8SixtapPredictFunc2 Predict8x8Funcs2[4] = { common_vt_6t_8w_lsx,
+                                                        common_vt_4t_8w_lsx,
+                                                        common_hz_6t_8w_lsx,
+                                                        common_hz_4t_8w_lsx };
+  const int8_t *h_filter;
+  const int8_t *v_filter;
+  int32_t x_odd;
+  int32_t y_odd;
+
+  if (xoffset < 0 || xoffset >= 8 || yoffset < 0 || yoffset >= 8) return;
+
+  x_odd = xoffset & 1;
+  y_odd = yoffset & 1;
+
+  /* Only form a table row pointer for a valid row. The previous code
+   * indexed row `offset - 1` unconditionally, i.e. row -1 for offset 0.
+   * A zero offset never uses its filter, so row 0 is a harmless stand-in.
+   */
+  h_filter = vp8_subpel_filters_lsx[xoffset ? xoffset - 1 : 0] + x_odd;
+  v_filter = vp8_subpel_filters_lsx[yoffset ? yoffset - 1 : 0] + y_odd;
+
+  if (xoffset && yoffset) {
+    Predict8x8Funcs1[(x_odd << 1) | y_odd](src, src_stride, dst, dst_stride,
+                                           h_filter, v_filter, 8);
+  } else if (yoffset) {
+    Predict8x8Funcs2[y_odd](src, src_stride, dst, dst_stride, v_filter, 8);
+  } else if (xoffset) {
+    Predict8x8Funcs2[2 + x_odd](src, src_stride, dst, dst_stride, h_filter,
+                                8);
+  } else {
+    /* No sub-pixel offset in either direction: straight copy. */
+    vp8_copy_mem8x8(src, src_stride, dst, dst_stride);
+  }
+}
+
+/* Sub-pixel prediction of a 16x16 block.
+ *
+ * xoffset / yoffset select the horizontal / vertical filter row
+ * (row `offset - 1` of vp8_subpel_filters_lsx). A zero offset means no
+ * filtering in that direction; with both offsets zero the block is copied.
+ * A non-zero even offset dispatches to a 6-tap kernel; an odd offset
+ * dispatches to a 4-tap kernel, which receives the filter row advanced by
+ * one coefficient. Offsets outside 0..7 leave dst untouched.
+ */
+void vp8_sixtap_predict16x16_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                 int32_t xoffset, int32_t yoffset,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride) {
+  /* Indexed by ((xoffset & 1) << 1) | (yoffset & 1). */
+  static PVp8SixtapPredictFunc1 Predict16x16Funcs1[4] = {
+    common_hv_6ht_6vt_16w_lsx,
+    common_hv_6ht_4vt_16w_lsx,
+    common_hv_4ht_6vt_16w_lsx,
+    common_hv_4ht_4vt_16w_lsx,
+  };
+  /* [0], [1]: vertical only (6-tap, 4-tap); [2], [3]: horizontal only. */
+  static PVp8SixtapPredictFunc2 Predict16x16Funcs2[4] = {
+    common_vt_6t_16w_lsx, common_vt_4t_16w_lsx, common_hz_6t_16w_lsx,
+    common_hz_4t_16w_lsx
+  };
+  const int8_t *h_filter;
+  const int8_t *v_filter;
+  int32_t x_odd;
+  int32_t y_odd;
+
+  if (xoffset < 0 || xoffset >= 8 || yoffset < 0 || yoffset >= 8) return;
+
+  x_odd = xoffset & 1;
+  y_odd = yoffset & 1;
+
+  /* Only form a table row pointer for a valid row. The previous code
+   * indexed row `offset - 1` unconditionally, i.e. row -1 for offset 0.
+   * A zero offset never uses its filter, so row 0 is a harmless stand-in.
+   *
+   * Adding x_odd / y_odd here gives every 4-tap kernel the advanced filter
+   * row. The previous odd/odd case passed the unadvanced horizontal row to
+   * common_hv_4ht_4vt_16w_lsx, unlike the 4x4 and 8x8 predictors.
+   */
+  h_filter = vp8_subpel_filters_lsx[xoffset ? xoffset - 1 : 0] + x_odd;
+  v_filter = vp8_subpel_filters_lsx[yoffset ? yoffset - 1 : 0] + y_odd;
+
+  if (xoffset && yoffset) {
+    Predict16x16Funcs1[(x_odd << 1) | y_odd](src, src_stride, dst, dst_stride,
+                                             h_filter, v_filter, 16);
+  } else if (yoffset) {
+    Predict16x16Funcs2[y_odd](src, src_stride, dst, dst_stride, v_filter,
+                              16);
+  } else if (xoffset) {
+    Predict16x16Funcs2[2 + x_odd](src, src_stride, dst, dst_stride, h_filter,
+                                  16);
+  } else {
+    /* No sub-pixel offset in either direction: straight copy. */
+    vp8_copy_mem16x16(src, src_stride, dst, dst_stride);
+  }
+}
diff --git a/media/libvpx/libvpx/vp8/common/loopfilter.h b/media/libvpx/libvpx/vp8/common/loopfilter.h
new file mode 100644
index 0000000000..909e8df512
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/loopfilter.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP8_COMMON_LOOPFILTER_H_
+#define VPX_VP8_COMMON_LOOPFILTER_H_
+
+#include "vpx_ports/mem.h"
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_LOOP_FILTER 63 /* the per-level tables in loop_filter_info_n have MAX_LOOP_FILTER + 1 entries */
+/* fraction of total macroblock rows to be used in fast filter level picking */
+/* has to be > 2 */
+#define PARTIAL_FRAME_FRACTION 8
+
+typedef enum { NORMAL_LOOPFILTER = 0, SIMPLE_LOOPFILTER = 1 } LOOPFILTERTYPE; /* loop filter variant; presumably pairs with the *_row_normal / *_row_simple functions declared below - confirm */
+
+#if VPX_ARCH_ARM
+#define SIMD_WIDTH 1 /* ARM builds: one byte per threshold table entry, alignment 1 */
+#else
+#define SIMD_WIDTH 16 /* other builds: 16 bytes per threshold table entry, alignment 16 */
+#endif
+
+/* Need to align this structure so when it is declared and
+ * passed it can be loaded into vector registers.
+ *
+ * Loop filter lookup tables. mblim, blim and lim each hold one
+ * SIMD_WIDTH-byte entry per index 0..MAX_LOOP_FILTER; hev_thr holds four
+ * such entries. All four are declared with SIMD_WIDTH alignment.
+ */
+typedef struct {
+ DECLARE_ALIGNED(SIMD_WIDTH, unsigned char,
+ mblim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]); /* presumably the macroblock-edge limit per filter level - confirm */
+ DECLARE_ALIGNED(SIMD_WIDTH, unsigned char,
+ blim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]); /* presumably the inner block-edge limit per filter level - confirm */
+ DECLARE_ALIGNED(SIMD_WIDTH, unsigned char,
+ lim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]); /* presumably the interior-difference limit per filter level - confirm */
+ DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, hev_thr[4][SIMD_WIDTH]); /* four high-edge-variance threshold entries */
+ unsigned char lvl[4][4][4]; /* NOTE(review): meaning of the three indices is not visible in this header - confirm */
+ unsigned char hev_thr_lut[2][MAX_LOOP_FILTER + 1]; /* NOTE(review): presumably maps a filter level to a hev_thr index, first index unclear - confirm */
+ unsigned char mode_lf_lut[10]; /* NOTE(review): presumably indexed by prediction mode - confirm */
+} loop_filter_info_n;
+
+typedef struct loop_filter_info { /* threshold pointers handed to the vp8_loop_filter_mb* / vp8_loop_filter_b* functions */
+ const unsigned char *mblim; /* passed as blimit to the macroblock-edge (mbloop_*) filters */
+ const unsigned char *blim; /* passed as blimit to the inner block-edge (loop_filter_*) filters */
+ const unsigned char *lim; /* passed as limit to both filter families */
+ const unsigned char *hev_thr; /* passed as thresh (high edge variance threshold) */
+} loop_filter_info;
+
+typedef void loop_filter_uvfunction(unsigned char *u, /* source pointer */
+ int p, /* pitch */
+ const unsigned char *blimit,
+ const unsigned char *limit,
+ const unsigned char *thresh,
+ unsigned char *v); /* function type (not a pointer type) taking a u and a v plane pointer plus the three thresholds */
+
+/* assorted loopfilter functions which get used elsewhere */
+struct VP8Common; /* forward declaration: only used through pointers in this header */
+struct macroblockd; /* forward declaration: only used through pointers in this header */
+struct modeinfo; /* forward declaration: only used through pointers in this header */
+
+void vp8_loop_filter_init(struct VP8Common *cm);
+
+void vp8_loop_filter_frame_init(struct VP8Common *cm, struct macroblockd *mbd,
+ int default_filt_lvl);
+
+void vp8_loop_filter_frame(struct VP8Common *cm, struct macroblockd *mbd,
+ int frame_type);
+
+void vp8_loop_filter_partial_frame(struct VP8Common *cm,
+ struct macroblockd *mbd,
+ int default_filt_lvl); /* NOTE(review): presumably filters the fraction of rows set by PARTIAL_FRAME_FRACTION - confirm in the implementation */
+
+void vp8_loop_filter_frame_yonly(struct VP8Common *cm, struct macroblockd *mbd,
+ int default_filt_lvl);
+
+void vp8_loop_filter_update_sharpness(loop_filter_info_n *lfi,
+ int sharpness_lvl);
+
+void vp8_loop_filter_row_normal(struct VP8Common *cm,
+ struct modeinfo *mode_info_context, int mb_row,
+ int post_ystride, int post_uvstride,
+ unsigned char *y_ptr, unsigned char *u_ptr,
+ unsigned char *v_ptr); /* row-level entry point taking all three plane pointers */
+
+void vp8_loop_filter_row_simple(struct VP8Common *cm,
+ struct modeinfo *mode_info_context, int mb_row,
+ int post_ystride, unsigned char *y_ptr); /* row-level entry point taking only the y plane pointer */
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP8_COMMON_LOOPFILTER_H_
diff --git a/media/libvpx/libvpx/vp8/common/loopfilter_filters.c b/media/libvpx/libvpx/vp8/common/loopfilter_filters.c
new file mode 100644
index 0000000000..61a55d3c92
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/loopfilter_filters.c
@@ -0,0 +1,397 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include "loopfilter.h"
+#include "onyxc_int.h"
+
+typedef unsigned char uc; /* shorthand for the pixel and threshold bytes used by the filter helpers below */
+
+/* Saturate an int to the signed 8-bit range [-128, 127]. */
+static signed char vp8_signed_char_clamp(int t) {
+  if (t < -128) return (signed char)-128;
+  if (t > 127) return (signed char)127;
+  return (signed char)t;
+}
+
+/* Decide whether an edge position should be filtered at all.
+ * Returns -1 (all bits set) when every neighbouring-pixel difference on
+ * both sides is within `limit` and the across-edge measure
+ * |p0 - q0| * 2 + |p1 - q1| / 2 is within `blimit`; returns 0 otherwise.
+ */
+static signed char vp8_filter_mask(uc limit, uc blimit, uc p3, uc p2, uc p1,
+                                   uc p0, uc q0, uc q1, uc q2, uc q3) {
+  const int interior_exceeds =
+      abs(p3 - p2) > limit || abs(p2 - p1) > limit || abs(p1 - p0) > limit ||
+      abs(q1 - q0) > limit || abs(q2 - q1) > limit || abs(q3 - q2) > limit;
+  const int edge_exceeds = abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit;
+
+  return (signed char)((interior_exceeds || edge_exceeds) ? 0 : -1);
+}
+
+/* High edge variance test.
+ * Returns -1 (all bits set) when |p1 - p0| or |q1 - q0| exceeds `thresh`,
+ * 0 otherwise.
+ */
+static signed char vp8_hevmask(uc thresh, uc p1, uc p0, uc q0, uc q1) {
+  const int p_side_high = abs(p1 - p0) > thresh;
+  const int q_side_high = abs(q1 - q0) > thresh;
+
+  return (signed char)((p_side_high || q_side_high) ? -1 : 0);
+}
+
+static void vp8_filter(signed char mask, uc hev, uc *op1, uc *op0, uc *oq0,
+ uc *oq1) { /* filters the four pixels nearest an edge (p1 p0 | q0 q1) in place */
+ signed char ps0, qs0;
+ signed char ps1, qs1;
+ signed char filter_value, Filter1, Filter2;
+ signed char u;
+
+ ps1 = (signed char)*op1 ^ 0x80; /* XOR with 0x80 maps the unsigned pixel into the signed char range */
+ ps0 = (signed char)*op0 ^ 0x80;
+ qs0 = (signed char)*oq0 ^ 0x80;
+ qs1 = (signed char)*oq1 ^ 0x80;
+
+ /* add outer taps if we have high edge variance */
+ filter_value = vp8_signed_char_clamp(ps1 - qs1);
+ filter_value &= hev;
+
+ /* inner taps */
+ filter_value = vp8_signed_char_clamp(filter_value + 3 * (qs0 - ps0));
+ filter_value &= mask; /* mask == 0 zeroes the adjustment, so the pixels are written back unchanged */
+
+ /* Filter1 rounds with +4 and Filter2 with +3 before the shift by 3,
+ * so the q side and the p side are adjusted by differently rounded
+ * amounts: q0 is lowered by Filter1 and p0 is raised by Filter2.
+ */
+ Filter1 = vp8_signed_char_clamp(filter_value + 4);
+ Filter2 = vp8_signed_char_clamp(filter_value + 3);
+ Filter1 >>= 3;
+ Filter2 >>= 3;
+ u = vp8_signed_char_clamp(qs0 - Filter1);
+ *oq0 = u ^ 0x80; /* XOR with 0x80 again converts back to the unsigned pixel */
+ u = vp8_signed_char_clamp(ps0 + Filter2);
+ *op0 = u ^ 0x80;
+ filter_value = Filter1;
+
+ /* outer tap adjustments: (Filter1 + 1) >> 1, applied only when hev is 0 */
+ filter_value += 1;
+ filter_value >>= 1;
+ filter_value &= ~hev;
+
+ u = vp8_signed_char_clamp(qs1 - filter_value);
+ *oq1 = u ^ 0x80;
+ u = vp8_signed_char_clamp(ps1 + filter_value);
+ *op1 = u ^ 0x80;
+}
+
+/* Run the inner-edge filter (vp8_filter) along a horizontal edge.
+ * `s` addresses q0 of the first column; rows at s - k * p form the p side
+ * and rows at s + k * p the q side. count * 8 adjacent columns are
+ * processed, and at least one column is always processed.
+ */
+static void loop_filter_horizontal_edge_c(unsigned char *s, int p, /* pitch */
+                                          const unsigned char *blimit,
+                                          const unsigned char *limit,
+                                          const unsigned char *thresh,
+                                          int count) {
+  int remaining = count * 8;
+
+  do {
+    unsigned char *const p1 = s - 2 * p;
+    unsigned char *const p0 = s - p;
+    unsigned char *const q1 = s + p;
+    const signed char mask =
+        vp8_filter_mask(limit[0], blimit[0], s[-4 * p], s[-3 * p], *p1, *p0,
+                        *s, *q1, s[2 * p], s[3 * p]);
+    /* high edge variance flag: -1 or 0 */
+    const int hev = vp8_hevmask(thresh[0], *p1, *p0, *s, *q1);
+
+    vp8_filter(mask, hev, p1, p0, s, q1);
+    ++s;
+  } while (--remaining > 0);
+}
+
+/* Run the inner-edge filter (vp8_filter) along a vertical edge.
+ * `s` addresses q0 of the first row; s[-1]..s[-4] form the p side and
+ * s[0]..s[3] the q side. count * 8 rows are processed, stepping by the
+ * pitch `p`, and at least one row is always processed.
+ */
+static void loop_filter_vertical_edge_c(unsigned char *s, int p,
+                                        const unsigned char *blimit,
+                                        const unsigned char *limit,
+                                        const unsigned char *thresh,
+                                        int count) {
+  int remaining = count * 8;
+
+  do {
+    const signed char mask =
+        vp8_filter_mask(limit[0], blimit[0], s[-4], s[-3], s[-2], s[-1], s[0],
+                        s[1], s[2], s[3]);
+    /* high edge variance flag: -1 or 0 */
+    const int hev = vp8_hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);
+
+    vp8_filter(mask, hev, s - 2, s - 1, s, s + 1);
+    s += p;
+  } while (--remaining > 0);
+}
+
+static void vp8_mbfilter(signed char mask, uc hev, uc *op2, uc *op1, uc *op0,
+ uc *oq0, uc *oq1, uc *oq2) { /* filters the six pixels nearest an edge (p2 p1 p0 | q0 q1 q2) in place */
+ signed char s, u;
+ signed char filter_value, Filter1, Filter2;
+ signed char ps2 = (signed char)*op2 ^ 0x80; /* XOR with 0x80 maps the unsigned pixel into the signed char range */
+ signed char ps1 = (signed char)*op1 ^ 0x80;
+ signed char ps0 = (signed char)*op0 ^ 0x80;
+ signed char qs0 = (signed char)*oq0 ^ 0x80;
+ signed char qs1 = (signed char)*oq1 ^ 0x80;
+ signed char qs2 = (signed char)*oq2 ^ 0x80;
+
+ /* combine the outer difference (ps1 - qs1) with 3x the inner difference; unlike vp8_filter the outer term is not gated by hev here */
+ filter_value = vp8_signed_char_clamp(ps1 - qs1);
+ filter_value = vp8_signed_char_clamp(filter_value + 3 * (qs0 - ps0));
+ filter_value &= mask;
+
+ Filter2 = filter_value;
+ Filter2 &= hev; /* this first adjustment is non-zero only when hev is set */
+
+ /* save bottom 3 bits so that we round one side +4 and the other +3 */
+ Filter1 = vp8_signed_char_clamp(Filter2 + 4);
+ Filter2 = vp8_signed_char_clamp(Filter2 + 3);
+ Filter1 >>= 3;
+ Filter2 >>= 3;
+ qs0 = vp8_signed_char_clamp(qs0 - Filter1);
+ ps0 = vp8_signed_char_clamp(ps0 + Filter2);
+
+ /* only apply wider filter if not high edge variance */
+ filter_value &= ~hev;
+ Filter2 = filter_value;
+
+ /* roughly 3/7th difference across boundary */
+ u = vp8_signed_char_clamp((63 + Filter2 * 27) >> 7);
+ s = vp8_signed_char_clamp(qs0 - u);
+ *oq0 = s ^ 0x80; /* XOR with 0x80 again converts back to the unsigned pixel */
+ s = vp8_signed_char_clamp(ps0 + u);
+ *op0 = s ^ 0x80;
+
+ /* roughly 2/7th difference across boundary */
+ u = vp8_signed_char_clamp((63 + Filter2 * 18) >> 7);
+ s = vp8_signed_char_clamp(qs1 - u);
+ *oq1 = s ^ 0x80;
+ s = vp8_signed_char_clamp(ps1 + u);
+ *op1 = s ^ 0x80;
+
+ /* roughly 1/7th difference across boundary */
+ u = vp8_signed_char_clamp((63 + Filter2 * 9) >> 7);
+ s = vp8_signed_char_clamp(qs2 - u);
+ *oq2 = s ^ 0x80;
+ s = vp8_signed_char_clamp(ps2 + u);
+ *op2 = s ^ 0x80;
+}
+
+/* Run the macroblock-edge filter (vp8_mbfilter) along a horizontal edge.
+ * `s` addresses q0 of the first column; rows at s - k * p form the p side
+ * and rows at s + k * p the q side. count * 8 adjacent columns are
+ * processed, and at least one column is always processed.
+ */
+static void mbloop_filter_horizontal_edge_c(unsigned char *s, int p,
+                                            const unsigned char *blimit,
+                                            const unsigned char *limit,
+                                            const unsigned char *thresh,
+                                            int count) {
+  int remaining = count * 8;
+
+  do {
+    unsigned char *const p2 = s - 3 * p;
+    unsigned char *const p1 = s - 2 * p;
+    unsigned char *const p0 = s - p;
+    unsigned char *const q1 = s + p;
+    unsigned char *const q2 = s + 2 * p;
+    const signed char mask = vp8_filter_mask(
+        limit[0], blimit[0], s[-4 * p], *p2, *p1, *p0, *s, *q1, *q2, s[3 * p]);
+    /* high edge variance flag: -1 or 0 */
+    const signed char hev = vp8_hevmask(thresh[0], *p1, *p0, *s, *q1);
+
+    vp8_mbfilter(mask, hev, p2, p1, p0, s, q1, q2);
+    ++s;
+  } while (--remaining > 0);
+}
+
+/* Run the macroblock-edge filter (vp8_mbfilter) along a vertical edge.
+ * `s` addresses q0 of the first row; s[-1]..s[-4] form the p side and
+ * s[0]..s[3] the q side. count * 8 rows are processed, stepping by the
+ * pitch `p`, and at least one row is always processed.
+ */
+static void mbloop_filter_vertical_edge_c(unsigned char *s, int p,
+                                          const unsigned char *blimit,
+                                          const unsigned char *limit,
+                                          const unsigned char *thresh,
+                                          int count) {
+  int remaining = count * 8;
+
+  do {
+    const signed char mask =
+        vp8_filter_mask(limit[0], blimit[0], s[-4], s[-3], s[-2], s[-1], s[0],
+                        s[1], s[2], s[3]);
+    /* high edge variance flag: -1 or 0 */
+    const signed char hev = vp8_hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);
+
+    vp8_mbfilter(mask, hev, s - 3, s - 2, s - 1, s, s + 1, s + 2);
+    s += p;
+  } while (--remaining > 0);
+}
+
+/* Simple-filter edge test.
+ * Returns -1 (all bits set) when |p0 - q0| * 2 + |p1 - q1| / 2 is within
+ * `blimit`, 0 otherwise.
+ */
+static signed char vp8_simple_filter_mask(uc blimit, uc p1, uc p0, uc q0,
+                                          uc q1) {
+  const int edge_measure = abs(p0 - q0) * 2 + abs(p1 - q1) / 2;
+
+  return (signed char)(edge_measure <= blimit ? -1 : 0);
+}
+
+/* Simple loop filter kernel: adjusts only p0 and q0 in place.
+ * p1 and q1 are read for the outer difference but never written.
+ * A zero `mask` zeroes the adjustment before rounding.
+ */
+static void vp8_simple_filter(signed char mask, uc *op1, uc *op0, uc *oq0,
+                              uc *oq1) {
+  /* XOR with 0x80 maps the unsigned pixels into the signed char range. */
+  const signed char p1 = (signed char)*op1 ^ 0x80;
+  const signed char p0 = (signed char)*op0 ^ 0x80;
+  const signed char q0 = (signed char)*oq0 ^ 0x80;
+  const signed char q1 = (signed char)*oq1 ^ 0x80;
+  /* Outer difference plus 3x the inner difference, saturated, then masked. */
+  const signed char adjust =
+      vp8_signed_char_clamp(vp8_signed_char_clamp(p1 - q1) + 3 * (q0 - p0)) &
+      mask;
+  /* The q side rounds with +4 and the p side with +3 before the shift. */
+  const signed char q_step = vp8_signed_char_clamp(adjust + 4) >> 3;
+  const signed char p_step = vp8_signed_char_clamp(adjust + 3) >> 3;
+
+  *oq0 = vp8_signed_char_clamp(q0 - q_step) ^ 0x80;
+  *op0 = vp8_signed_char_clamp(p0 + p_step) ^ 0x80;
+}
+
+/* Simple loop filter along a horizontal edge, 16 columns wide.
+ * `y_ptr` addresses q0 of the first column; the rows one and two strides
+ * above are p0 and p1, the row one stride below is q1.
+ */
+void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y_ptr,
+                                              int y_stride,
+                                              const unsigned char *blimit) {
+  int col;
+
+  for (col = 0; col < 16; ++col) {
+    unsigned char *const p1 = y_ptr - 2 * y_stride;
+    unsigned char *const p0 = y_ptr - y_stride;
+    unsigned char *const q1 = y_ptr + y_stride;
+    const signed char mask =
+        vp8_simple_filter_mask(blimit[0], *p1, *p0, *y_ptr, *q1);
+
+    vp8_simple_filter(mask, p1, p0, y_ptr, q1);
+    ++y_ptr;
+  }
+}
+
+/* Simple loop filter along a vertical edge, 16 rows tall.
+ * `y_ptr` addresses q0 of the first row; y_ptr[-1] and y_ptr[-2] are p0
+ * and p1, y_ptr[1] is q1. Rows are stepped by `y_stride`.
+ */
+void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y_ptr, int y_stride,
+                                            const unsigned char *blimit) {
+  int row;
+
+  for (row = 0; row < 16; ++row) {
+    const signed char mask = vp8_simple_filter_mask(
+        blimit[0], y_ptr[-2], y_ptr[-1], y_ptr[0], y_ptr[1]);
+
+    vp8_simple_filter(mask, y_ptr - 2, y_ptr - 1, y_ptr, y_ptr + 1);
+    y_ptr += y_stride;
+  }
+}
+
+/* Macroblock-edge filtering of the horizontal edge at the given pointers.
+ * The y edge is filtered over 16 columns (count 2); the u and v edges over
+ * 8 columns each (count 1), and only when their pointers are non-null.
+ */
+void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr,
+                           unsigned char *v_ptr, int y_stride, int uv_stride,
+                           loop_filter_info *lfi) {
+  unsigned char *const chroma[2] = { u_ptr, v_ptr };
+  int plane;
+
+  mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim,
+                                  lfi->hev_thr, 2);
+
+  for (plane = 0; plane < 2; ++plane) {
+    if (!chroma[plane]) continue;
+    mbloop_filter_horizontal_edge_c(chroma[plane], uv_stride, lfi->mblim,
+                                    lfi->lim, lfi->hev_thr, 1);
+  }
+}
+
+/* Macroblock-edge filtering of the vertical edge at the given pointers.
+ * The y edge is filtered over 16 rows (count 2); the u and v edges over
+ * 8 rows each (count 1), and only when their pointers are non-null.
+ */
+void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr,
+                           unsigned char *v_ptr, int y_stride, int uv_stride,
+                           loop_filter_info *lfi) {
+  unsigned char *const chroma[2] = { u_ptr, v_ptr };
+  int plane;
+
+  mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim,
+                                lfi->hev_thr, 2);
+
+  for (plane = 0; plane < 2; ++plane) {
+    if (!chroma[plane]) continue;
+    mbloop_filter_vertical_edge_c(chroma[plane], uv_stride, lfi->mblim,
+                                  lfi->lim, lfi->hev_thr, 1);
+  }
+}
+
+/* Inner block-edge filtering of horizontal edges.
+ * In the y plane the edges 4, 8 and 12 rows below y_ptr are filtered over
+ * 16 columns each. In each non-null chroma plane the edge 4 rows below the
+ * plane pointer is filtered over 8 columns.
+ */
+void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr,
+                          unsigned char *v_ptr, int y_stride, int uv_stride,
+                          loop_filter_info *lfi) {
+  int row;
+
+  for (row = 4; row < 16; row += 4) {
+    loop_filter_horizontal_edge_c(y_ptr + row * y_stride, y_stride, lfi->blim,
+                                  lfi->lim, lfi->hev_thr, 2);
+  }
+
+  if (u_ptr) {
+    loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride, lfi->blim,
+                                  lfi->lim, lfi->hev_thr, 1);
+  }
+
+  if (v_ptr) {
+    loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride, lfi->blim,
+                                  lfi->lim, lfi->hev_thr, 1);
+  }
+}
+
+/* Simple-filter version of the inner horizontal edges: filters the edges
+ * 4, 8 and 12 rows below y_ptr, in that order.
+ */
+void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride,
+                           const unsigned char *blimit) {
+  int row;
+
+  for (row = 4; row < 16; row += 4) {
+    vp8_loop_filter_simple_horizontal_edge_c(y_ptr + row * y_stride, y_stride,
+                                             blimit);
+  }
+}
+
+/* Inner block-edge filtering of vertical edges.
+ * In the y plane the edges 4, 8 and 12 columns right of y_ptr are filtered
+ * over 16 rows each. In each non-null chroma plane the edge 4 columns right
+ * of the plane pointer is filtered over 8 rows.
+ */
+void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr,
+                          unsigned char *v_ptr, int y_stride, int uv_stride,
+                          loop_filter_info *lfi) {
+  int col;
+
+  for (col = 4; col < 16; col += 4) {
+    loop_filter_vertical_edge_c(y_ptr + col, y_stride, lfi->blim, lfi->lim,
+                                lfi->hev_thr, 2);
+  }
+
+  if (u_ptr) {
+    loop_filter_vertical_edge_c(u_ptr + 4, uv_stride, lfi->blim, lfi->lim,
+                                lfi->hev_thr, 1);
+  }
+
+  if (v_ptr) {
+    loop_filter_vertical_edge_c(v_ptr + 4, uv_stride, lfi->blim, lfi->lim,
+                                lfi->hev_thr, 1);
+  }
+}
+
+/* Simple-filter version of the inner vertical edges: filters the edges
+ * 4, 8 and 12 columns right of y_ptr, in that order.
+ */
+void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride,
+                           const unsigned char *blimit) {
+  int col;
+
+  for (col = 4; col < 16; col += 4) {
+    vp8_loop_filter_simple_vertical_edge_c(y_ptr + col, y_stride, blimit);
+  }
+}
diff --git a/media/libvpx/libvpx/vp8/common/mbpitch.c b/media/libvpx/libvpx/vp8/common/mbpitch.c
new file mode 100644
index 0000000000..188b57f389
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/mbpitch.c
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "blockd.h"
+
+/* Point every block descriptor at its slice of the macroblock buffers.
+ *
+ * Blocks 0..15 get predictor pointers laid out as a 4x4 grid of 4x4 blocks
+ * in a 16-byte-wide area. Blocks 16..19 and 20..23 get 2x2 grids in
+ * 8-byte-wide areas starting at predictor offsets 256 and 320. Blocks
+ * 0..24 each get 16 consecutive entries of qcoeff / dqcoeff and one entry
+ * of eobs.
+ */
+void vp8_setup_block_dptrs(MACROBLOCKD *x) {
+  int blk;
+
+  for (blk = 0; blk < 16; ++blk) {
+    const int row = blk >> 2;
+    const int col = blk & 3;
+
+    x->block[blk].predictor = x->predictor + row * 4 * 16 + col * 4;
+  }
+
+  for (blk = 0; blk < 4; ++blk) {
+    const int offset = (blk >> 1) * 4 * 8 + (blk & 1) * 4;
+
+    x->block[16 + blk].predictor = x->predictor + 256 + offset;
+    x->block[20 + blk].predictor = x->predictor + 320 + offset;
+  }
+
+  for (blk = 0; blk < 25; ++blk) {
+    x->block[blk].qcoeff = x->qcoeff + blk * 16;
+    x->block[blk].dqcoeff = x->dqcoeff + blk * 16;
+    x->block[blk].eob = x->eobs + blk;
+  }
+}
+
+/* Compute each block's offset inside the destination frame planes.
+ *
+ * The 16 y blocks form a 4x4 grid of 4x4 blocks addressed with
+ * dst.y_stride. The U blocks (16..19) form a 2x2 grid addressed with
+ * dst.uv_stride, and each V block (20..23) shares the offset of the
+ * matching U block.
+ */
+void vp8_build_block_doffsets(MACROBLOCKD *x) {
+  int idx;
+
+  /* y blocks */
+  for (idx = 0; idx < 16; ++idx) {
+    const int row = idx >> 2;
+    const int col = idx & 3;
+
+    x->block[idx].offset = row * 4 * x->dst.y_stride + col * 4;
+  }
+
+  /* U and V blocks */
+  for (idx = 0; idx < 4; ++idx) {
+    const int uv_offset = (idx >> 1) * 4 * x->dst.uv_stride + (idx & 1) * 4;
+
+    x->block[16 + idx].offset = uv_offset;
+    x->block[20 + idx].offset = uv_offset;
+  }
+}
diff --git a/media/libvpx/libvpx/vp8/common/mfqe.c b/media/libvpx/libvpx/vp8/common/mfqe.c
new file mode 100644
index 0000000000..1fe7363f17
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/mfqe.c
@@ -0,0 +1,327 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+/* MFQE: Multiframe Quality Enhancement
+ * In rate limited situations keyframes may cause significant visual artifacts
+ * commonly referred to as "popping." This file implements a postprocessing
+ * algorithm which blends data from the preceding frame when there is no
+ * motion and the q from the previous frame is lower which indicates that it is
+ * higher quality.
+ */
+
+#include "./vp8_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vp8/common/common.h"
+#include "vp8/common/postproc.h"
+#include "vpx_dsp/variance.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_scale/yv12config.h"
+
+#include <limits.h>
+#include <stdlib.h>
+
+/* Blend a square block of `src` into `dst` in place.
+ * Each output pixel is
+ *   (src * src_weight + dst * dst_weight + rounding) >> MFQE_PRECISION
+ * where dst_weight = (1 << MFQE_PRECISION) - src_weight and rounding is
+ * half of 1 << MFQE_PRECISION.
+ */
+static void filter_by_weight(unsigned char *src, int src_stride,
+                             unsigned char *dst, int dst_stride, int block_size,
+                             int src_weight) {
+  const int dst_weight = (1 << MFQE_PRECISION) - src_weight;
+  const int half = 1 << (MFQE_PRECISION - 1);
+  int rows_left;
+
+  for (rows_left = block_size; rows_left > 0; --rows_left) {
+    int col;
+
+    for (col = 0; col < block_size; ++col) {
+      const int blended = src[col] * src_weight + dst[col] * dst_weight + half;
+
+      dst[col] = blended >> MFQE_PRECISION;
+    }
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+/* C implementation of the weighted blend for a 16x16 block. */
+void vp8_filter_by_weight16x16_c(unsigned char *src, int src_stride,
+                                 unsigned char *dst, int dst_stride,
+                                 int src_weight) {
+  const int block_size = 16;
+
+  filter_by_weight(src, src_stride, dst, dst_stride, block_size, src_weight);
+}
+
+/* C implementation of the weighted blend for an 8x8 block. */
+void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride,
+                               unsigned char *dst, int dst_stride,
+                               int src_weight) {
+  const int block_size = 8;
+
+  filter_by_weight(src, src_stride, dst, dst_stride, block_size, src_weight);
+}
+
+/* C implementation of the weighted blend for a 4x4 block. */
+void vp8_filter_by_weight4x4_c(unsigned char *src, int src_stride,
+                               unsigned char *dst, int dst_stride,
+                               int src_weight) {
+  const int block_size = 4;
+
+  filter_by_weight(src, src_stride, dst, dst_stride, block_size, src_weight);
+}
+
+/* Blend one block of all three planes with the given source weight.
+ * block_size 16 blends a 16x16 y block and 8x8 u / v blocks; any other
+ * value blends an 8x8 y block and 4x4 u / v blocks.
+ */
+static void apply_ifactor(unsigned char *y_src, int y_src_stride,
+                          unsigned char *y_dst, int y_dst_stride,
+                          unsigned char *u_src, unsigned char *v_src,
+                          int uv_src_stride, unsigned char *u_dst,
+                          unsigned char *v_dst, int uv_dst_stride,
+                          int block_size, int src_weight) {
+  if (block_size != 16) {
+    vp8_filter_by_weight8x8(y_src, y_src_stride, y_dst, y_dst_stride,
+                            src_weight);
+    vp8_filter_by_weight4x4(u_src, uv_src_stride, u_dst, uv_dst_stride,
+                            src_weight);
+    vp8_filter_by_weight4x4(v_src, uv_src_stride, v_dst, uv_dst_stride,
+                            src_weight);
+    return;
+  }
+
+  vp8_filter_by_weight16x16(y_src, y_src_stride, y_dst, y_dst_stride,
+                            src_weight);
+  vp8_filter_by_weight8x8(u_src, uv_src_stride, u_dst, uv_dst_stride,
+                          src_weight);
+  vp8_filter_by_weight8x8(v_src, uv_src_stride, v_dst, uv_dst_stride,
+                          src_weight);
+}
+
+/* Integer square root of x, rounded to the nearest integer.
+ *
+ * The floor of the root is built one bit at a time from the top, then
+ * bumped by one when x lies above the midpoint between guess^2 and
+ * (guess + 1)^2.
+ *
+ * Squares are formed in unsigned long long. The first trial for
+ * x >= 2^31 is 65536, whose square does not fit in a 32-bit unsigned int;
+ * the previous 32-bit arithmetic wrapped there and gave a wrong root.
+ */
+static unsigned int int_sqrt(unsigned int x) {
+  unsigned int y = x;
+  unsigned long long guess = 0;
+  int p = 1;
+
+  /* p becomes half the bit length of x: the highest bit the root can have. */
+  while (y >>= 1) p++;
+  p >>= 1;
+
+  while (p >= 0) {
+    const unsigned long long trial = guess | (1ULL << p);
+
+    if (trial * trial <= x) guess = trial;
+    p--;
+  }
+  /* choose between guess or guess+1 */
+  return (unsigned int)(guess + (guess * guess + guess + 1 <= x));
+}
+
+#define USE_SSD /* selects the sum-of-squared-difference branches below over the SAD branches */
+static void multiframe_quality_enhance_block(
+ int blksize, /* Currently only values supported are 16, 8 */
+ int qcurr, int qprev, unsigned char *y, unsigned char *u, unsigned char *v,
+ int y_stride, int uv_stride, unsigned char *yd, unsigned char *ud,
+ unsigned char *vd, int yd_stride, int uvd_stride) { /* y/u/v are the source block, yd/ud/vd the destination block that is blended into or overwritten */
+ static const unsigned char VP8_ZEROS[16] = { 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0 };
+ int uvblksize = blksize >> 1;
+ int qdiff = qcurr - qprev;
+
+ int i;
+ unsigned char *up;
+ unsigned char *udp;
+ unsigned char *vp;
+ unsigned char *vdp;
+
+ unsigned int act, actd, sad, usad, vsad, sse, thr, thrsq, actrisk;
+
+ if (blksize == 16) {
+ actd = (vpx_variance16x16(yd, yd_stride, VP8_ZEROS, 0, &sse) + 128) >> 8; /* reference is the all-zero row with stride 0; result scaled per pixel (256 pixels) */
+ act = (vpx_variance16x16(y, y_stride, VP8_ZEROS, 0, &sse) + 128) >> 8;
+#ifdef USE_SSD
+ vpx_variance16x16(y, y_stride, yd, yd_stride, &sse); /* only the sse output is used; the return value is discarded */
+ sad = (sse + 128) >> 8;
+ vpx_variance8x8(u, uv_stride, ud, uvd_stride, &sse);
+ usad = (sse + 32) >> 6;
+ vpx_variance8x8(v, uv_stride, vd, uvd_stride, &sse);
+ vsad = (sse + 32) >> 6;
+#else
+ sad = (vpx_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8;
+ usad = (vpx_sad8x8(u, uv_stride, ud, uvd_stride) + 32) >> 6;
+ vsad = (vpx_sad8x8(v, uv_stride, vd, uvd_stride) + 32) >> 6;
+#endif
+ } else {
+ actd = (vpx_variance8x8(yd, yd_stride, VP8_ZEROS, 0, &sse) + 32) >> 6; /* same measures for an 8x8 y block with 4x4 chroma */
+ act = (vpx_variance8x8(y, y_stride, VP8_ZEROS, 0, &sse) + 32) >> 6;
+#ifdef USE_SSD
+ vpx_variance8x8(y, y_stride, yd, yd_stride, &sse);
+ sad = (sse + 32) >> 6;
+ vpx_variance4x4(u, uv_stride, ud, uvd_stride, &sse);
+ usad = (sse + 8) >> 4;
+ vpx_variance4x4(v, uv_stride, vd, uvd_stride, &sse);
+ vsad = (sse + 8) >> 4;
+#else
+ sad = (vpx_sad8x8(y, y_stride, yd, yd_stride) + 32) >> 6;
+ usad = (vpx_sad4x4(u, uv_stride, ud, uvd_stride) + 8) >> 4;
+ vsad = (vpx_sad4x4(v, uv_stride, vd, uvd_stride) + 8) >> 4;
+#endif
+ }
+
+ actrisk = (actd > act * 5); /* set when the destination block's measure is more than 5x the source block's */
+
+ /* thr = qdiff/16 + floor(log2(actd)) + floor(log4(qprev)); note both actd and qprev are consumed by the loops */
+ thr = (qdiff >> 4); /* NOTE(review): thr is unsigned, so this assumes qdiff >= 0 - confirm the caller guarantees it */
+ while (actd >>= 1) thr++;
+ while (qprev >>= 2) thr++;
+
+#ifdef USE_SSD
+ thrsq = thr * thr;
+ if (sad < thrsq &&
+ /* additional checks for color mismatch and excessive addition of
+ * high-frequencies */
+ 4 * usad < thrsq && 4 * vsad < thrsq && !actrisk)
+#else
+ if (sad < thr &&
+ /* additional checks for color mismatch and excessive addition of
+ * high-frequencies */
+ 2 * usad < thr && 2 * vsad < thr && !actrisk)
+#endif
+ {
+ int ifactor;
+#ifdef USE_SSD
+ /* TODO: optimize this later to not need sqr root */
+ sad = int_sqrt(sad);
+#endif
+ ifactor = (sad << MFQE_PRECISION) / thr; /* thr cannot be 0 here: the branch condition requires sad to be below a value derived from thr */
+ ifactor >>= (qdiff >> 5); /* NOTE(review): also assumes qdiff >= 0 (a negative shift count is undefined) - confirm */
+
+ if (ifactor) { /* ifactor is the source weight; with a zero weight the destination block is left as it is */
+ apply_ifactor(y, y_stride, yd, yd_stride, u, v, uv_stride, ud, vd,
+ uvd_stride, blksize, ifactor);
+ }
+ } else { /* otherwise copy the source block (y/u/v) over the destination block (yd/ud/vd) unchanged */
+ if (blksize == 16) {
+ vp8_copy_mem16x16(y, y_stride, yd, yd_stride);
+ vp8_copy_mem8x8(u, uv_stride, ud, uvd_stride);
+ vp8_copy_mem8x8(v, uv_stride, vd, uvd_stride);
+ } else {
+ vp8_copy_mem8x8(y, y_stride, yd, yd_stride);
+ for (up = u, udp = ud, i = 0; i < uvblksize; /* chroma is copied row by row with memcpy, uvblksize bytes per row */
+ ++i, up += uv_stride, udp += uvd_stride) {
+ memcpy(udp, up, uvblksize);
+ }
+ for (vp = v, vdp = vd, i = 0; i < uvblksize;
+ ++i, vp += uv_stride, vdp += uvd_stride) {
+ memcpy(vdp, vp, uvblksize);
+ }
+ }
+ }
+}
+
+/* Decide which 8x8 quadrants of an inter macroblock qualify for
+ * enhancement.
+ *
+ * map[0..3] receive 1 for a qualifying quadrant and 0 otherwise; the
+ * return value is the number of qualifying quadrants (0..4).
+ *   - A macroblock with mb_skip_coeff set qualifies in all four quadrants.
+ *   - For SPLITMV, a quadrant qualifies when all four of its 4x4 blocks
+ *     (listed in ndx) have motion vector components of magnitude <= 2.
+ *   - Otherwise all four quadrants share one verdict: mode above B_PRED
+ *     and both macroblock motion vector components of magnitude <= 2.
+ */
+static int qualify_inter_mb(const MODE_INFO *mode_info_context, int *map) {
+  if (mode_info_context->mbmi.mb_skip_coeff) {
+    map[0] = map[1] = map[2] = map[3] = 1;
+  } else if (mode_info_context->mbmi.mode == SPLITMV) {
+    /* 4x4 block indices that make up each 8x8 quadrant. */
+    static const int ndx[4][4] = {
+      { 0, 1, 4, 5 }, { 2, 3, 6, 7 }, { 8, 9, 12, 13 }, { 10, 11, 14, 15 }
+    };
+    int i, j;
+
+    for (i = 0; i < 4; ++i) {
+      map[i] = 1;
+      /* Stop at the first block that disqualifies this quadrant. The loop
+       * previously tested map[j], which could read entries that had not
+       * been written yet and could skip blocks of this quadrant.
+       */
+      for (j = 0; j < 4 && map[i]; ++j) {
+        /* Compare magnitudes, as the non-split branch does; a signed
+         * comparison would accept any large negative motion.
+         */
+        map[i] &=
+            (abs(mode_info_context->bmi[ndx[i][j]].mv.as_mv.row) <= 2 &&
+             abs(mode_info_context->bmi[ndx[i][j]].mv.as_mv.col) <= 2);
+      }
+    }
+  } else {
+    map[0] = map[1] = map[2] = map[3] =
+        (mode_info_context->mbmi.mode > B_PRED &&
+         abs(mode_info_context->mbmi.mv.as_mv.row) <= 2 &&
+         abs(mode_info_context->mbmi.mv.as_mv.col) <= 2);
+  }
+  return (map[0] + map[1] + map[2] + map[3]);
+}
+
+void vp8_multiframe_quality_enhance(VP8_COMMON *cm) { /* walks every macroblock of cm->frame_to_show and blends or copies it into cm->post_proc_buffer */
+ YV12_BUFFER_CONFIG *show = cm->frame_to_show;
+ YV12_BUFFER_CONFIG *dest = &cm->post_proc_buffer;
+
+ FRAME_TYPE frame_type = cm->frame_type;
+ /* Point at base of Mb MODE_INFO list has motion vectors etc */
+ const MODE_INFO *mode_info_context = cm->mi;
+ int mb_row;
+ int mb_col;
+ int totmap, map[4]; /* map[] holds one flag per 8x8 quadrant, totmap the number of set flags */
+ int qcurr = cm->base_qindex;
+ int qprev = cm->postproc_state.last_base_qindex;
+
+ unsigned char *y_ptr, *u_ptr, *v_ptr;
+ unsigned char *yd_ptr, *ud_ptr, *vd_ptr;
+
+ /* Set up the buffer pointers */
+ y_ptr = show->y_buffer;
+ u_ptr = show->u_buffer;
+ v_ptr = show->v_buffer;
+ yd_ptr = dest->y_buffer;
+ ud_ptr = dest->u_buffer;
+ vd_ptr = dest->v_buffer;
+
+ /* postprocess each macro block */
+ for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) {
+ for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) {
+ /* if motion is high there will likely be no benefit */
+ if (frame_type == INTER_FRAME) {
+ totmap = qualify_inter_mb(mode_info_context, map);
+ } else {
+ totmap = (frame_type == KEY_FRAME ? 4 : 0); /* map[] is not filled on this path; it is only read when 0 < totmap < 4 */
+ }
+ if (totmap) {
+ if (totmap < 4) { /* mixed macroblock: handle each 8x8 quadrant (4x4 in chroma) on its own */
+ int i, j;
+ for (i = 0; i < 2; ++i) {
+ for (j = 0; j < 2; ++j) {
+ if (map[i * 2 + j]) {
+ multiframe_quality_enhance_block(
+ 8, qcurr, qprev, y_ptr + 8 * (i * show->y_stride + j),
+ u_ptr + 4 * (i * show->uv_stride + j),
+ v_ptr + 4 * (i * show->uv_stride + j), show->y_stride,
+ show->uv_stride, yd_ptr + 8 * (i * dest->y_stride + j),
+ ud_ptr + 4 * (i * dest->uv_stride + j),
+ vd_ptr + 4 * (i * dest->uv_stride + j), dest->y_stride,
+ dest->uv_stride);
+ } else {
+ /* copy a 8x8 block */
+ int k;
+ unsigned char *up = u_ptr + 4 * (i * show->uv_stride + j);
+ unsigned char *udp = ud_ptr + 4 * (i * dest->uv_stride + j);
+ unsigned char *vp = v_ptr + 4 * (i * show->uv_stride + j);
+ unsigned char *vdp = vd_ptr + 4 * (i * dest->uv_stride + j);
+ vp8_copy_mem8x8(
+ y_ptr + 8 * (i * show->y_stride + j), show->y_stride,
+ yd_ptr + 8 * (i * dest->y_stride + j), dest->y_stride);
+ for (k = 0; k < 4; ++k, up += show->uv_stride, /* the matching 4x4 chroma blocks are copied row by row */
+ udp += dest->uv_stride, vp += show->uv_stride,
+ vdp += dest->uv_stride) {
+ memcpy(udp, up, 4);
+ memcpy(vdp, vp, 4);
+ }
+ }
+ }
+ }
+ } else { /* totmap = 4 */
+ multiframe_quality_enhance_block(
+ 16, qcurr, qprev, y_ptr, u_ptr, v_ptr, show->y_stride,
+ show->uv_stride, yd_ptr, ud_ptr, vd_ptr, dest->y_stride,
+ dest->uv_stride);
+ }
+ } else { /* nothing qualifies: copy the whole macroblock from show to dest */
+ vp8_copy_mem16x16(y_ptr, show->y_stride, yd_ptr, dest->y_stride);
+ vp8_copy_mem8x8(u_ptr, show->uv_stride, ud_ptr, dest->uv_stride);
+ vp8_copy_mem8x8(v_ptr, show->uv_stride, vd_ptr, dest->uv_stride);
+ }
+ y_ptr += 16; /* advance to the next macroblock in the row: 16 luma / 8 chroma pixels */
+ u_ptr += 8;
+ v_ptr += 8;
+ yd_ptr += 16;
+ ud_ptr += 8;
+ vd_ptr += 8;
+ mode_info_context++; /* step to next MB */
+ }
+
+ y_ptr += show->y_stride * 16 - 16 * cm->mb_cols; /* rewind to column 0 and move down one macroblock row */
+ u_ptr += show->uv_stride * 8 - 8 * cm->mb_cols;
+ v_ptr += show->uv_stride * 8 - 8 * cm->mb_cols;
+ yd_ptr += dest->y_stride * 16 - 16 * cm->mb_cols;
+ ud_ptr += dest->uv_stride * 8 - 8 * cm->mb_cols;
+ vd_ptr += dest->uv_stride * 8 - 8 * cm->mb_cols;
+
+ mode_info_context++; /* Skip border mb */
+ }
+}
diff --git a/media/libvpx/libvpx/vp8/common/mips/dspr2/dequantize_dspr2.c b/media/libvpx/libvpx/vp8/common/mips/dspr2/dequantize_dspr2.c
new file mode 100644
index 0000000000..1cfd146189
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/mips/dspr2/dequantize_dspr2.c
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#include "vpx_mem/vpx_mem.h"
+
+#if HAVE_DSPR2
+void vp8_dequant_idct_add_dspr2(short *input, short *dq, unsigned char *dest,
+ int stride) {
+ /* Dequantize 16 coefficients in place, run the 4x4 IDCT, then clear input. */
+ int k = 0;
+ do {
+ input[k] = input[k] * dq[k];
+ } while (++k < 16);
+ /* dest/stride are handed to the IDCT helper as both of its buffer pairs. */
+ vp8_short_idct4x4llm_dspr2(input, dest, stride, dest, stride);
+ /* 32 bytes covers the 16 coefficients above, assuming a 2-byte short. */
+ memset(input, 0, 32);
+}
+
+#endif
diff --git a/media/libvpx/libvpx/vp8/common/mips/dspr2/filter_dspr2.c b/media/libvpx/libvpx/vp8/common/mips/dspr2/filter_dspr2.c
new file mode 100644
index 0000000000..b9da52084d
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/mips/dspr2/filter_dspr2.c
@@ -0,0 +1,2767 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include "vp8_rtcd.h"
+#include "vpx_ports/mem.h"
+
+#if HAVE_DSPR2
+#define CROP_WIDTH 256 /* number of saturated entries on each side of the 0..255 identity range of ff_cropTbl */
+unsigned char ff_cropTbl[256 + 2 * CROP_WIDTH]; /* clamp lookup table; filled by dsputil_static_init() and indexed through (ff_cropTbl + CROP_WIDTH) */
+
+static const unsigned short sub_pel_filterss[8][3] = { /* 8 rows x 3 words; each 16-bit word packs two byte-sized values (0x7b0c is 123 and 12). Indexed by yoffset in the second pass; presumably tap magnitudes - TODO confirm */
+ { 0, 0, 0 },
+ { 0, 0x0601, 0x7b0c },
+ { 0x0201, 0x0b08, 0x6c24 },
+ { 0, 0x0906, 0x5d32 },
+ { 0x0303, 0x1010, 0x4d4d },
+ { 0, 0x0609, 0x325d },
+ { 0x0102, 0x080b, 0x246c },
+ { 0, 0x0106, 0x0c7b },
+};
+
+static const int sub_pel_filters_int[8][3] = { /* 8 rows x 3 words; each 32-bit word packs two 16-bit halves (0xfffa is -6 as a signed halfword). Row n is sub_pel_filters_inv[n] with the halves of every word swapped */
+ { 0, 0, 0 },
+ { 0x0000fffa, 0x007b000c, 0xffff0000 },
+ { 0x0002fff5, 0x006c0024, 0xfff80001 },
+ { 0x0000fff7, 0x005d0032, 0xfffa0000 },
+ { 0x0003fff0, 0x004d004d, 0xfff00003 },
+ { 0x0000fffa, 0x0032005d, 0xfff70000 },
+ { 0x0001fff8, 0x0024006c, 0xfff50002 },
+ { 0x0000ffff, 0x000c007b, 0xfffa0000 },
+};
+
+static const int sub_pel_filters_inv[8][3] = { /* Indexed [xoffset][0..2] by the first-pass filters; each word holds two 16-bit halves fed to dpa.w.ph. Row 0 is all zero; only rows 2, 4 and 6 have a third word above 65536, which the callers use to pick the 6-tap path */
+ { 0, 0, 0 },
+ { 0xfffa0000, 0x000c007b, 0x0000ffff },
+ { 0xfff50002, 0x0024006c, 0x0001fff8 },
+ { 0xfff70000, 0x0032005d, 0x0000fffa },
+ { 0xfff00003, 0x004d004d, 0x0003fff0 },
+ { 0xfffa0000, 0x005d0032, 0x0000fff7 },
+ { 0xfff80001, 0x006c0024, 0x0002fff5 },
+ { 0xffff0000, 0x007b000c, 0x0000fffa },
+};
+
+/* clang-format off */
+static const int sub_pel_filters_int_tap_4[8][2] = { /* Four 16-bit values per row packed into two words; only the odd rows (1, 3, 5, 7) are populated. Row n is sub_pel_filters_inv_tap_4[n] with the halves of every word swapped */
+ { 0, 0},
+ { 0xfffa007b, 0x000cffff},
+ { 0, 0},
+ { 0xfff7005d, 0x0032fffa},
+ { 0, 0},
+ { 0xfffa0032, 0x005dfff7},
+ { 0, 0},
+ { 0xffff000c, 0x007bfffa},
+};
+
+
+static const int sub_pel_filters_inv_tap_4[8][2] = { /* Indexed [xoffset][0..1] by the 4-tap branches of the first-pass filters; four 16-bit values per row, only the odd rows (1, 3, 5, 7) are populated */
+ { 0, 0},
+ { 0x007bfffa, 0xffff000c},
+ { 0, 0},
+ { 0x005dfff7, 0xfffa0032},
+ { 0, 0},
+ { 0x0032fffa, 0xfff7005d},
+ { 0, 0},
+ { 0x000cffff, 0xfffa007b},
+};
+/* clang-format on */
+
+inline void prefetch_load(unsigned char *src) { /* Issue a MIPS "pref" with hint 0 (load) for the address in src. NOTE(review): a non-static inline emits no external definition under C99 inline rules - confirm the build's C dialect or consider static */
+ __asm__ __volatile__("pref 0, 0(%[src]) \n\t" : : [src] "r"(src));
+}
+
+inline void prefetch_store(unsigned char *dst) { /* Issue a MIPS "pref" with hint 1 (store) for the address in dst. NOTE(review): a non-static inline emits no external definition under C99 inline rules - confirm the build's C dialect or consider static */
+ __asm__ __volatile__("pref 1, 0(%[dst]) \n\t" : : [dst] "r"(dst));
+}
+
+void dsputil_static_init(void) {
+ /* Clamp table layout: CROP_WIDTH zeros, 0..255, then CROP_WIDTH 255s. */
+ unsigned char *const mid = ff_cropTbl + CROP_WIDTH;
+ int n;
+ for (n = 0; n < CROP_WIDTH; ++n) {
+ ff_cropTbl[n] = 0;
+ mid[256 + n] = 255;
+ }
+ for (n = 0; n < 256; ++n) mid[n] = n;
+}
+
+void vp8_filter_block2d_first_pass_4(unsigned char *RESTRICT src_ptr, /* Horizontal filter pass: writes 4 bytes per row for output_height rows. src_ptr: source row; bytes -2..+6 around it are read on the 6-tap path */
+ unsigned char *RESTRICT dst_ptr, /* destination; bytes 0..3 are written for every row */
+ unsigned int src_pixels_per_line, /* byte step between consecutive source rows */
+ unsigned int output_height, int xoffset, /* rows to produce; row index (0..7) into the coefficient tables */
+ int pitch) { /* dst_ptr step per row on the filtered paths (the copy path steps by 4) */
+ unsigned int i;
+ int Temp1, Temp2, Temp3, Temp4;
+ /* 64 is loaded into the accumulator low word (mtlo) before each dot product */
+ unsigned int vector4a = 64;
+ int vector1b, vector2b, vector3b;
+ unsigned int tp1, tp2, tn1, tn2;
+ unsigned int p1, p2, p3;
+ unsigned int n1, n2, n3;
+ unsigned char *cm = ff_cropTbl + CROP_WIDTH; /* clamp lookup: negative indexes land in the zero region, indexes above 255 in the 255 region */
+ /* NOTE(review): the asm below stores through dst_ptr (sb) without a "memory" clobber, uses $ac2/$ac3 without declaring them, and extp depends on DSPControl state not set in this function - confirm against the toolchain and callers */
+ vector3b = sub_pel_filters_inv[xoffset][2];
+
+ /* a zero third word only occurs in table row 0, i.e. xoffset == 0: no filtering, plain copy */
+ if (vector3b == 0) {
+ for (i = 0; i < output_height; ++i) {
+ /* prefetch src_ptr data to cache memory */
+ prefetch_load(src_ptr + src_pixels_per_line);
+ dst_ptr[0] = src_ptr[0];
+ dst_ptr[1] = src_ptr[1];
+ dst_ptr[2] = src_ptr[2];
+ dst_ptr[3] = src_ptr[3];
+
+ /* next row... (note: the copy path advances dst_ptr by 4, not by pitch) */
+ src_ptr += src_pixels_per_line;
+ dst_ptr += 4;
+ }
+ } else {
+ if (vector3b > 65536) { /* third word above 0xffff: table rows 2, 4 and 6 */
+ /* 6 tap filter */
+
+ vector1b = sub_pel_filters_inv[xoffset][0];
+ vector2b = sub_pel_filters_inv[xoffset][1];
+
+ /* prefetch src_ptr data to cache memory (issued once, before the row loop) */
+ prefetch_load(src_ptr + src_pixels_per_line);
+
+ for (i = output_height; i--;) {
+ /* apply filter with vectors pairs */
+ __asm__ __volatile__(
+ "ulw %[tp1], -2(%[src_ptr]) \n\t"
+ "ulw %[tp2], 2(%[src_ptr]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
+
+ /* even 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "preceu.ph.qbl %[p1], %[tp2] \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+ "extp %[Temp1], $ac3, 9 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t"
+
+ /* odd 1. pixel */
+ "ulw %[tn2], 3(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "preceu.ph.qbr %[n1], %[tp2] \n\t"
+ "preceu.ph.qbl %[n2], %[tp2] \n\t"
+ "preceu.ph.qbr %[n3], %[tn2] \n\t"
+ "extp %[Temp3], $ac2, 9 \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t"
+
+ /* odd 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "preceu.ph.qbl %[n1], %[tn2] \n\t"
+ "extp %[Temp2], $ac3, 9 \n\t"
+ "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
+ "extp %[Temp4], $ac2, 9 \n\t"
+
+ /* clamp through the crop table */
+ "lbux %[tp1], %[Temp1](%[cm]) \n\t"
+ "lbux %[tn1], %[Temp2](%[cm]) \n\t"
+ "lbux %[tp2], %[Temp3](%[cm]) \n\t"
+ "lbux %[n2], %[Temp4](%[cm]) \n\t"
+
+ /* store bytes: output order is even 1, odd 1, even 2, odd 2 */
+ "sb %[tp1], 0(%[dst_ptr]) \n\t"
+ "sb %[tn1], 1(%[dst_ptr]) \n\t"
+ "sb %[tp2], 2(%[dst_ptr]) \n\t"
+ "sb %[n2], 3(%[dst_ptr]) \n\t"
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
+ [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
+ [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3),
+ [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
+ [Temp4] "=&r"(Temp4)
+ : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
+ [vector4a] "r"(vector4a), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr),
+ [vector3b] "r"(vector3b), [src_ptr] "r"(src_ptr));
+
+ /* Next row... */
+ src_ptr += src_pixels_per_line;
+ dst_ptr += pitch;
+ }
+ } else {
+ /* 4 tap filter: coefficients come from sub_pel_filters_inv_tap_4; source bytes -1..+6 are read */
+
+ vector1b = sub_pel_filters_inv_tap_4[xoffset][0];
+ vector2b = sub_pel_filters_inv_tap_4[xoffset][1];
+
+ for (i = output_height; i--;) {
+ /* apply filter with vectors pairs */
+ __asm__ __volatile__(
+ "ulw %[tp1], -1(%[src_ptr]) \n\t"
+ "ulw %[tp2], 3(%[src_ptr]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+
+ /* even 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
+ "extp %[Temp1], $ac3, 9 \n\t"
+
+ /* odd 1. pixel */
+ "srl %[tn1], %[tp2], 8 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "preceu.ph.qbr %[n1], %[tp2] \n\t"
+ "preceu.ph.qbl %[n2], %[tp2] \n\t"
+ "preceu.ph.qbr %[n3], %[tn1] \n\t"
+ "extp %[Temp3], $ac2, 9 \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
+
+ /* odd 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "extp %[Temp2], $ac3, 9 \n\t"
+ "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t"
+ "extp %[Temp4], $ac2, 9 \n\t"
+
+ /* clamp and store results */
+ "lbux %[tp1], %[Temp1](%[cm]) \n\t"
+ "lbux %[tn1], %[Temp2](%[cm]) \n\t"
+ "lbux %[tp2], %[Temp3](%[cm]) \n\t"
+ "sb %[tp1], 0(%[dst_ptr]) \n\t"
+ "sb %[tn1], 1(%[dst_ptr]) \n\t"
+ "lbux %[n2], %[Temp4](%[cm]) \n\t"
+ "sb %[tp2], 2(%[dst_ptr]) \n\t"
+ "sb %[n2], 3(%[dst_ptr]) \n\t"
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
+ [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [n1] "=&r"(n1),
+ [n2] "=&r"(n2), [n3] "=&r"(n3), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4)
+ : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
+ [vector4a] "r"(vector4a), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr),
+ [src_ptr] "r"(src_ptr));
+ /* Next row... */
+ src_ptr += src_pixels_per_line;
+ dst_ptr += pitch;
+ }
+ }
+ }
+}
+
+void vp8_filter_block2d_first_pass_8_all(unsigned char *RESTRICT src_ptr, /* Horizontal filter pass: writes 8 bytes per row for output_height rows. src_ptr: source row; neighbouring bytes on both sides are read by the word loads */
+ unsigned char *RESTRICT dst_ptr, /* destination; bytes 0..7 are written for every row */
+ unsigned int src_pixels_per_line, /* byte step between consecutive source rows */
+ unsigned int output_height, /* number of rows to produce */
+ int xoffset, int pitch) { /* xoffset: row index (0..7) into the coefficient tables; pitch: dst_ptr step per row on the filtered paths (the copy path steps by 8) */
+ unsigned int i;
+ int Temp1, Temp2, Temp3, Temp4;
+ /* 64 is loaded into the accumulator low word (mtlo) before each dot product */
+ unsigned int vector4a = 64;
+ unsigned int vector1b, vector2b, vector3b;
+ unsigned int tp1, tp2, tn1, tn2;
+ unsigned int p1, p2, p3, p4;
+ unsigned int n1, n2, n3, n4;
+ /* NOTE(review): each row uses two asm statements; the first ends by reloading $ac3 (mtlo) and the second accumulates into $ac3 straight away, so the accumulator must survive between statements although it is not declared to the compiler. extp also depends on DSPControl state not set here - confirm */
+ unsigned char *cm = ff_cropTbl + CROP_WIDTH; /* clamp lookup: negative indexes land in the zero region, indexes above 255 in the 255 region */
+
+ /* if (xoffset == 0) we don't need any filtering */
+ if (xoffset == 0) {
+ for (i = 0; i < output_height; ++i) {
+ /* prefetch src_ptr data to cache memory */
+ prefetch_load(src_ptr + src_pixels_per_line);
+
+ dst_ptr[0] = src_ptr[0];
+ dst_ptr[1] = src_ptr[1];
+ dst_ptr[2] = src_ptr[2];
+ dst_ptr[3] = src_ptr[3];
+ dst_ptr[4] = src_ptr[4];
+ dst_ptr[5] = src_ptr[5];
+ dst_ptr[6] = src_ptr[6];
+ dst_ptr[7] = src_ptr[7];
+
+ /* next row... (note: the copy path advances dst_ptr by 8, not by pitch) */
+ src_ptr += src_pixels_per_line;
+ dst_ptr += 8;
+ }
+ } else {
+ vector3b = sub_pel_filters_inv[xoffset][2];
+
+ if (vector3b > 65536) { /* third word above 0xffff: table rows 2, 4 and 6 */
+ /* 6 tap filter: word loads touch source bytes -2..+10 */
+
+ vector1b = sub_pel_filters_inv[xoffset][0];
+ vector2b = sub_pel_filters_inv[xoffset][1];
+
+ for (i = output_height; i--;) {
+ /* prefetch src_ptr data to cache memory */
+ prefetch_load(src_ptr + src_pixels_per_line);
+
+ /* apply filter with vectors pairs */
+ __asm__ __volatile__(
+ "ulw %[tp1], -2(%[src_ptr]) \n\t"
+ "ulw %[tp2], 2(%[src_ptr]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
+
+ /* even 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "preceu.ph.qbl %[p1], %[tp2] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t"
+
+ "balign %[tp2], %[tp1], 3 \n\t"
+ "extp %[Temp1], $ac3, 9 \n\t"
+ "ulw %[tn2], 3(%[src_ptr]) \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "preceu.ph.qbr %[n1], %[tp2] \n\t"
+ "preceu.ph.qbl %[n2], %[tp2] \n\t"
+ "preceu.ph.qbr %[n3], %[tn2] \n\t"
+ "extp %[Temp3], $ac2, 9 \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t"
+
+ /* odd 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "preceu.ph.qbl %[n1], %[tn2] \n\t"
+ "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
+ "ulw %[tp1], 6(%[src_ptr]) \n\t"
+ "extp %[Temp2], $ac3, 9 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "preceu.ph.qbr %[p2], %[tp1] \n\t"
+ "extp %[Temp4], $ac2, 9 \n\t"
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn2] "=&r"(tn2),
+ [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [n1] "=&r"(n1),
+ [n2] "=&r"(n2), [n3] "=&r"(n3), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [Temp4] "=r"(Temp4)
+ : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
+ [vector4a] "r"(vector4a), [vector3b] "r"(vector3b),
+ [src_ptr] "r"(src_ptr));
+
+ /* clamp and store results */
+ dst_ptr[0] = cm[Temp1];
+ dst_ptr[1] = cm[Temp2];
+ dst_ptr[2] = cm[Temp3];
+ dst_ptr[3] = cm[Temp4];
+
+ /* next 4 pixels ($ac3 was already reloaded at the end of the previous asm statement) */
+ __asm__ __volatile__(
+ /* even 3. pixel */
+ "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector3b] \n\t"
+
+ /* even 4. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "preceu.ph.qbl %[p4], %[tp1] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
+
+ "ulw %[tn1], 7(%[src_ptr]) \n\t"
+ "extp %[Temp1], $ac3, 9 \n\t"
+
+ /* odd 3. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "preceu.ph.qbr %[n2], %[tn1] \n\t"
+ "dpa.w.ph $ac3, %[n3], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector3b] \n\t"
+ "extp %[Temp3], $ac2, 9 \n\t"
+
+ /* odd 4. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "preceu.ph.qbl %[n4], %[tn1] \n\t"
+ "dpa.w.ph $ac2, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[n2], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[n4], %[vector3b] \n\t"
+ "extp %[Temp2], $ac3, 9 \n\t"
+ "extp %[Temp4], $ac2, 9 \n\t"
+
+ : [tn1] "=&r"(tn1), [n2] "=&r"(n2), [p4] "=&r"(p4), [n4] "=&r"(n4),
+ [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
+ [Temp4] "=r"(Temp4)
+ : [tp1] "r"(tp1), [vector1b] "r"(vector1b), [p2] "r"(p2),
+ [vector2b] "r"(vector2b), [n1] "r"(n1), [p1] "r"(p1),
+ [vector4a] "r"(vector4a), [vector3b] "r"(vector3b), [p3] "r"(p3),
+ [n3] "r"(n3), [src_ptr] "r"(src_ptr));
+
+ /* clamp and store results */
+ dst_ptr[4] = cm[Temp1];
+ dst_ptr[5] = cm[Temp2];
+ dst_ptr[6] = cm[Temp3];
+ dst_ptr[7] = cm[Temp4];
+
+ src_ptr += src_pixels_per_line;
+ dst_ptr += pitch;
+ }
+ } else {
+ /* 4 tap filter: coefficients come from sub_pel_filters_inv_tap_4; word loads touch source bytes -1..+11 */
+
+ vector1b = sub_pel_filters_inv_tap_4[xoffset][0];
+ vector2b = sub_pel_filters_inv_tap_4[xoffset][1];
+
+ for (i = output_height; i--;) {
+ /* prefetch src_ptr data to cache memory */
+ prefetch_load(src_ptr + src_pixels_per_line);
+
+ /* apply filter with vectors pairs */
+ __asm__ __volatile__(
+ "ulw %[tp1], -1(%[src_ptr]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+
+ "ulw %[tp2], 3(%[src_ptr]) \n\t"
+
+ /* even 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "preceu.ph.qbl %[p4], %[tp2] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
+ "extp %[Temp1], $ac3, 9 \n\t"
+
+ "balign %[tp2], %[tp1], 3 \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "preceu.ph.qbr %[n1], %[tp2] \n\t"
+ "preceu.ph.qbl %[n2], %[tp2] \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
+ "extp %[Temp3], $ac2, 9 \n\t"
+
+ "ulw %[tn2], 4(%[src_ptr]) \n\t"
+
+ /* odd 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "preceu.ph.qbr %[n3], %[tn2] \n\t"
+ "preceu.ph.qbl %[n4], %[tn2] \n\t"
+ "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t"
+ "ulw %[tp1], 7(%[src_ptr]) \n\t"
+ "extp %[Temp2], $ac3, 9 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "extp %[Temp4], $ac2, 9 \n\t"
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn2] "=&r"(tn2),
+ [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4),
+ [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3), [n4] "=&r"(n4),
+ [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
+ [Temp4] "=r"(Temp4)
+ : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
+ [vector4a] "r"(vector4a), [src_ptr] "r"(src_ptr));
+
+ /* clamp and store results */
+ dst_ptr[0] = cm[Temp1];
+ dst_ptr[1] = cm[Temp2];
+ dst_ptr[2] = cm[Temp3];
+ dst_ptr[3] = cm[Temp4];
+
+ /* next 4 pixels ($ac3 was already reloaded at the end of the previous asm statement) */
+ __asm__ __volatile__(
+ /* even 3. pixel */
+ "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t"
+
+ /* even 4. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "preceu.ph.qbr %[p2], %[tp1] \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t"
+ "extp %[Temp1], $ac3, 9 \n\t"
+
+ /* odd 3. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "dpa.w.ph $ac3, %[n3], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n4], %[vector2b] \n\t"
+ "ulw %[tn1], 8(%[src_ptr]) \n\t"
+ "extp %[Temp3], $ac2, 9 \n\t"
+
+ /* odd 4. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "preceu.ph.qbr %[n2], %[tn1] \n\t"
+ "dpa.w.ph $ac2, %[n4], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[n2], %[vector2b] \n\t"
+ "extp %[Temp2], $ac3, 9 \n\t"
+ "extp %[Temp4], $ac2, 9 \n\t"
+
+ : [tn1] "=&r"(tn1), [p2] "=&r"(p2), [n2] "=&r"(n2),
+ [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
+ [Temp4] "=r"(Temp4)
+ : [tp1] "r"(tp1), [p3] "r"(p3), [p4] "r"(p4),
+ [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
+ [vector4a] "r"(vector4a), [src_ptr] "r"(src_ptr), [n3] "r"(n3),
+ [n4] "r"(n4));
+
+ /* clamp and store results */
+ dst_ptr[4] = cm[Temp1];
+ dst_ptr[5] = cm[Temp2];
+ dst_ptr[6] = cm[Temp3];
+ dst_ptr[7] = cm[Temp4];
+
+ /* next row... */
+ src_ptr += src_pixels_per_line;
+ dst_ptr += pitch;
+ }
+ }
+ }
+}
+
+void vp8_filter_block2d_first_pass16_6tap(unsigned char *RESTRICT src_ptr, /* Horizontal 6-tap pass: writes 16 bytes per row for output_height rows. src_ptr: source row; word loads touch bytes -2..+18 */
+ unsigned char *RESTRICT dst_ptr, /* destination; bytes 0..15 are written for every row */
+ unsigned int src_pixels_per_line, /* byte step between consecutive source rows */
+ unsigned int output_height, /* number of rows to produce */
+ int xoffset, int pitch) { /* xoffset: row of sub_pel_filters_inv to use (no xoffset == 0 or 4-tap special case here); pitch: dst_ptr step per row */
+ unsigned int i;
+ int Temp1, Temp2, Temp3, Temp4;
+ /* NOTE(review): each row uses four asm statements that hand $ac3 from one to the next (mtlo at the end of one, dpa.w.ph at the start of the next) without declaring it; the last one stores through dst_ptr (sb) without a "memory" clobber; extp depends on DSPControl state not set here - confirm */
+ unsigned int vector4a;
+ unsigned int vector1b, vector2b, vector3b;
+ unsigned int tp1, tp2, tn1, tn2;
+ unsigned int p1, p2, p3, p4;
+ unsigned int n1, n2, n3, n4;
+ unsigned char *cm = ff_cropTbl + CROP_WIDTH; /* clamp lookup: negative indexes land in the zero region, indexes above 255 in the 255 region */
+
+ vector1b = sub_pel_filters_inv[xoffset][0];
+ vector2b = sub_pel_filters_inv[xoffset][1];
+ vector3b = sub_pel_filters_inv[xoffset][2];
+ vector4a = 64; /* loaded into the accumulator low word (mtlo) before each dot product */
+
+ for (i = output_height; i--;) {
+ /* prefetch src_ptr data to cache memory */
+ prefetch_load(src_ptr + src_pixels_per_line);
+
+ /* apply filter with vectors pairs */
+ __asm__ __volatile__(
+ "ulw %[tp1], -2(%[src_ptr]) \n\t"
+ "ulw %[tp2], 2(%[src_ptr]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
+
+ /* even 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "preceu.ph.qbl %[p1], %[tp2] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t"
+
+ "balign %[tp2], %[tp1], 3 \n\t"
+ "ulw %[tn2], 3(%[src_ptr]) \n\t"
+ "extp %[Temp1], $ac3, 9 \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "preceu.ph.qbr %[n1], %[tp2] \n\t"
+ "preceu.ph.qbl %[n2], %[tp2] \n\t"
+ "preceu.ph.qbr %[n3], %[tn2] \n\t"
+ "extp %[Temp3], $ac2, 9 \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t"
+
+ /* odd 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "preceu.ph.qbl %[n1], %[tn2] \n\t"
+ "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
+ "ulw %[tp1], 6(%[src_ptr]) \n\t"
+ "extp %[Temp2], $ac3, 9 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "preceu.ph.qbr %[p2], %[tp1] \n\t"
+ "extp %[Temp4], $ac2, 9 \n\t"
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn2] "=&r"(tn2), [p1] "=&r"(p1),
+ [p2] "=&r"(p2), [p3] "=&r"(p3), [n1] "=&r"(n1), [n2] "=&r"(n2),
+ [n3] "=&r"(n3), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
+ [Temp3] "=&r"(Temp3), [Temp4] "=r"(Temp4)
+ : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
+ [vector4a] "r"(vector4a), [vector3b] "r"(vector3b),
+ [src_ptr] "r"(src_ptr));
+
+ /* clamp and store results */
+ dst_ptr[0] = cm[Temp1];
+ dst_ptr[1] = cm[Temp2];
+ dst_ptr[2] = cm[Temp3];
+ dst_ptr[3] = cm[Temp4];
+
+ /* next 4 pixels (output bytes 4..7) */
+ __asm__ __volatile__(
+ /* even 3. pixel */
+ "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector3b] \n\t"
+
+ /* even 4. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "preceu.ph.qbl %[p4], %[tp1] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
+ "ulw %[tn1], 7(%[src_ptr]) \n\t"
+ "extp %[Temp1], $ac3, 9 \n\t"
+
+ /* odd 3. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "preceu.ph.qbr %[n2], %[tn1] \n\t"
+ "dpa.w.ph $ac3, %[n3], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector3b] \n\t"
+ "extp %[Temp3], $ac2, 9 \n\t"
+
+ /* odd 4. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "preceu.ph.qbl %[n4], %[tn1] \n\t"
+ "dpa.w.ph $ac2, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[n2], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[n4], %[vector3b] \n\t"
+ "ulw %[tp2], 10(%[src_ptr]) \n\t"
+ "extp %[Temp2], $ac3, 9 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "extp %[Temp4], $ac2, 9 \n\t"
+
+ : [tn1] "=&r"(tn1), [tp2] "=&r"(tp2), [n2] "=&r"(n2), [p4] "=&r"(p4),
+ [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
+ [Temp3] "=&r"(Temp3), [Temp4] "=r"(Temp4), [p1] "+r"(p1)
+ : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), [tp1] "r"(tp1),
+ [n1] "r"(n1), [vector4a] "r"(vector4a), [p2] "r"(p2),
+ [vector3b] "r"(vector3b), [p3] "r"(p3), [n3] "r"(n3),
+ [src_ptr] "r"(src_ptr));
+
+ /* clamp and store results */
+ dst_ptr[4] = cm[Temp1];
+ dst_ptr[5] = cm[Temp2];
+ dst_ptr[6] = cm[Temp3];
+ dst_ptr[7] = cm[Temp4];
+
+ /* next 4 pixels (output bytes 8..11) */
+ __asm__ __volatile__(
+ /* even 5. pixel */
+ "dpa.w.ph $ac3, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t"
+
+ /* even 6. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "preceu.ph.qbl %[p3], %[tp2] \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[vector3b] \n\t"
+
+ "ulw %[tn1], 11(%[src_ptr]) \n\t"
+ "extp %[Temp1], $ac3, 9 \n\t"
+
+ /* odd 5. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "preceu.ph.qbr %[n1], %[tn1] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n4], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t"
+ "extp %[Temp3], $ac2, 9 \n\t"
+
+ /* odd 6. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "preceu.ph.qbl %[n3], %[tn1] \n\t"
+ "dpa.w.ph $ac2, %[n4], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[n1], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[n3], %[vector3b] \n\t"
+ "ulw %[tp1], 14(%[src_ptr]) \n\t"
+ "extp %[Temp2], $ac3, 9 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "preceu.ph.qbr %[p4], %[tp1] \n\t"
+ "extp %[Temp4], $ac2, 9 \n\t"
+
+ : [tn1] "=&r"(tn1), [tp1] "=&r"(tp1), [n1] "=&r"(n1), [p3] "=&r"(p3),
+ [n3] "=&r"(n3), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
+ [Temp3] "=&r"(Temp3), [Temp4] "=r"(Temp4), [p4] "+r"(p4)
+ : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), [tp2] "r"(tp2),
+ [p2] "r"(p2), [n2] "r"(n2), [n4] "r"(n4), [p1] "r"(p1),
+ [src_ptr] "r"(src_ptr), [vector4a] "r"(vector4a),
+ [vector3b] "r"(vector3b));
+
+ /* clamp and store results */
+ dst_ptr[8] = cm[Temp1];
+ dst_ptr[9] = cm[Temp2];
+ dst_ptr[10] = cm[Temp3];
+ dst_ptr[11] = cm[Temp4];
+
+ /* next 4 pixels (output bytes 12..15, clamped and stored inside the asm) */
+ __asm__ __volatile__(
+ /* even 7. pixel */
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector3b] \n\t"
+
+ /* even 8. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector3b] \n\t"
+ "ulw %[tn1], 15(%[src_ptr]) \n\t"
+ "extp %[Temp1], $ac3, 9 \n\t"
+
+ /* odd 7. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "preceu.ph.qbr %[n4], %[tn1] \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n3], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[n4], %[vector3b] \n\t"
+ "extp %[Temp3], $ac2, 9 \n\t"
+
+ /* odd 8. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "preceu.ph.qbl %[n2], %[tn1] \n\t"
+ "dpa.w.ph $ac2, %[n3], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[n4], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[n2], %[vector3b] \n\t"
+ "extp %[Temp2], $ac3, 9 \n\t"
+ "extp %[Temp4], $ac2, 9 \n\t"
+
+ /* clamp and store results */
+ "lbux %[tp1], %[Temp1](%[cm]) \n\t"
+ "lbux %[tn1], %[Temp2](%[cm]) \n\t"
+ "lbux %[p2], %[Temp3](%[cm]) \n\t"
+ "sb %[tp1], 12(%[dst_ptr]) \n\t"
+ "sb %[tn1], 13(%[dst_ptr]) \n\t"
+ "lbux %[n2], %[Temp4](%[cm]) \n\t"
+ "sb %[p2], 14(%[dst_ptr]) \n\t"
+ "sb %[n2], 15(%[dst_ptr]) \n\t"
+
+ : [tn1] "=&r"(tn1), [p2] "=&r"(p2), [n2] "=&r"(n2), [n4] "=&r"(n4),
+ [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
+ [Temp4] "=r"(Temp4), [tp1] "+r"(tp1)
+ : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), [p4] "r"(p4),
+ [n1] "r"(n1), [p1] "r"(p1), [vector4a] "r"(vector4a),
+ [vector3b] "r"(vector3b), [p3] "r"(p3), [n3] "r"(n3),
+ [src_ptr] "r"(src_ptr), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
+
+ src_ptr += src_pixels_per_line;
+ dst_ptr += pitch;
+ }
+}
+
+void vp8_filter_block2d_first_pass16_0(unsigned char *RESTRICT src_ptr, /* Copy-only first pass: copies 21 rows of 16 bytes from src_ptr (unaligned word loads) */
+ unsigned char *RESTRICT output_ptr, /* destination; rows are written back-to-back, 16 bytes apart */
+ unsigned int src_pixels_per_line) { /* byte step between consecutive source rows */
+ int Temp1, Temp2, Temp3, Temp4;
+ int i;
+ /* NOTE(review): output is written with sw (word store), so output_ptr is presumably 4-byte aligned; the asm stores without a "memory" clobber - confirm both */
+ /* issue a store prefetch for output_ptr + 32 (once, before the loop) */
+ prefetch_store(output_ptr + 32);
+
+ /* copy memory from src buffer to dst buffer: 7 iterations x 3 rows each = 21 rows */
+ for (i = 0; i < 7; ++i) {
+ __asm__ __volatile__(
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "ulw %[Temp3], 8(%[src_ptr]) \n\t"
+ "ulw %[Temp4], 12(%[src_ptr]) \n\t"
+ "sw %[Temp1], 0(%[output_ptr]) \n\t"
+ "sw %[Temp2], 4(%[output_ptr]) \n\t"
+ "sw %[Temp3], 8(%[output_ptr]) \n\t"
+ "sw %[Temp4], 12(%[output_ptr]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
+ [Temp4] "=&r"(Temp4), [src_ptr] "+r"(src_ptr)
+ : [src_pixels_per_line] "r"(src_pixels_per_line), [output_ptr] "r"(
+ output_ptr));
+
+ __asm__ __volatile__(
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "ulw %[Temp3], 8(%[src_ptr]) \n\t"
+ "ulw %[Temp4], 12(%[src_ptr]) \n\t"
+ "sw %[Temp1], 16(%[output_ptr]) \n\t"
+ "sw %[Temp2], 20(%[output_ptr]) \n\t"
+ "sw %[Temp3], 24(%[output_ptr]) \n\t"
+ "sw %[Temp4], 28(%[output_ptr]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
+ [Temp4] "=&r"(Temp4), [src_ptr] "+r"(src_ptr)
+ : [src_pixels_per_line] "r"(src_pixels_per_line), [output_ptr] "r"(
+ output_ptr));
+
+ __asm__ __volatile__(
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "ulw %[Temp3], 8(%[src_ptr]) \n\t"
+ "ulw %[Temp4], 12(%[src_ptr]) \n\t"
+ "sw %[Temp1], 32(%[output_ptr]) \n\t"
+ "sw %[Temp2], 36(%[output_ptr]) \n\t"
+ "sw %[Temp3], 40(%[output_ptr]) \n\t"
+ "sw %[Temp4], 44(%[output_ptr]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
+ [Temp4] "=&r"(Temp4), [src_ptr] "+r"(src_ptr)
+ : [src_pixels_per_line] "r"(src_pixels_per_line), [output_ptr] "r"(
+ output_ptr));
+
+ output_ptr += 48; /* three 16-byte rows were written this iteration */
+ }
+}
+
+void vp8_filter_block2d_first_pass16_4tap( /* Horizontal 4-tap pass over 16-pixel-wide rows, processed as four groups of 4 pixels */
+ unsigned char *RESTRICT src_ptr, unsigned char *RESTRICT output_ptr, /* source rows; intermediate buffer written when yoffset != 0 */
+ unsigned int src_pixels_per_line, unsigned int output_width, /* source row step; output_ptr row step */
+ unsigned int output_height, int xoffset, int yoffset, /* rows to filter; row of sub_pel_filters_inv_tap_4; yoffset is only tested against 0 */
+ unsigned char *RESTRICT dst_ptr, int pitch) { /* final destination and its row step, written when yoffset == 0 */
+ unsigned int i, j;
+ int Temp1, Temp2, Temp3, Temp4;
+ /* NOTE(review): the asm stores (sb) without a "memory" clobber and uses $ac2/$ac3 without declaring them - confirm against the toolchain */
+ unsigned int vector4a;
+ int vector1b, vector2b;
+ unsigned int tp1, tp2, tp3, tn1;
+ unsigned int p1, p2, p3;
+ unsigned int n1, n2, n3;
+ unsigned char *cm = ff_cropTbl + CROP_WIDTH; /* clamp lookup: negative indexes land in the zero region, indexes above 255 in the 255 region */
+
+ vector4a = 64; /* loaded into the accumulator low word before each dot product; extr.w then shifts right by 7 */
+
+ vector1b = sub_pel_filters_inv_tap_4[xoffset][0];
+ vector2b = sub_pel_filters_inv_tap_4[xoffset][1];
+
+ /* if (yoffset == 0) don't need temp buffer, data will be stored in dst_ptr */
+ if (yoffset == 0) {
+ output_height -= 5; /* presumably the caller's count includes 5 extra rows needed only by a vertical pass - TODO confirm */
+ src_ptr += (src_pixels_per_line + src_pixels_per_line); /* skip the first two source rows */
+
+ for (i = output_height; i--;) {
+ __asm__ __volatile__("ulw %[tp3], -1(%[src_ptr]) \n\t"
+ : [tp3] "=&r"(tp3)
+ : [src_ptr] "r"(src_ptr));
+
+ /* processing 4 adjacent pixels; tp3 carries the word at src_ptr - 1 into the next group */
+ for (j = 0; j < 16; j += 4) {
+ /* apply filter with vectors pairs */
+ __asm__ __volatile__(
+ "ulw %[tp2], 3(%[src_ptr]) "
+ "\n\t"
+ "move %[tp1], %[tp3] "
+ "\n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 "
+ "\n\t"
+ "mthi $0, $ac3 "
+ "\n\t"
+ "move %[tp3], %[tp2] "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[tp1] "
+ "\n\t"
+ "preceu.ph.qbl %[p2], %[tp1] "
+ "\n\t"
+ "preceu.ph.qbr %[p3], %[tp2] "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] "
+ "\n\t"
+
+ /* even 2. pixel */
+ "mtlo %[vector4a], $ac2 "
+ "\n\t"
+ "mthi $0, $ac2 "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector1b] "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p3], %[vector2b] "
+ "\n\t"
+ "extr.w %[Temp1], $ac3, 7 "
+ "\n\t"
+
+ /* odd 1. pixel */
+ "ulw %[tn1], 4(%[src_ptr]) "
+ "\n\t"
+ "balign %[tp2], %[tp1], 3 "
+ "\n\t"
+ "mtlo %[vector4a], $ac3 "
+ "\n\t"
+ "mthi $0, $ac3 "
+ "\n\t"
+ "preceu.ph.qbr %[n1], %[tp2] "
+ "\n\t"
+ "preceu.ph.qbl %[n2], %[tp2] "
+ "\n\t"
+ "preceu.ph.qbr %[n3], %[tn1] "
+ "\n\t"
+ "extr.w %[Temp3], $ac2, 7 "
+ "\n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector1b] "
+ "\n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector2b] "
+ "\n\t"
+
+ /* odd 2. pixel */
+ "mtlo %[vector4a], $ac2 "
+ "\n\t"
+ "mthi $0, $ac2 "
+ "\n\t"
+ "extr.w %[Temp2], $ac3, 7 "
+ "\n\t"
+ "dpa.w.ph $ac2, %[n2], %[vector1b] "
+ "\n\t"
+ "dpa.w.ph $ac2, %[n3], %[vector2b] "
+ "\n\t"
+ "extr.w %[Temp4], $ac2, 7 "
+ "\n\t"
+
+ /* clamp and store results */
+ "lbux %[tp1], %[Temp1](%[cm]) "
+ "\n\t"
+ "lbux %[tn1], %[Temp2](%[cm]) "
+ "\n\t"
+ "lbux %[tp2], %[Temp3](%[cm]) "
+ "\n\t"
+ "sb %[tp1], 0(%[dst_ptr]) "
+ "\n\t"
+ "sb %[tn1], 1(%[dst_ptr]) "
+ "\n\t"
+ "lbux %[n2], %[Temp4](%[cm]) "
+ "\n\t"
+ "sb %[tp2], 2(%[dst_ptr]) "
+ "\n\t"
+ "sb %[n2], 3(%[dst_ptr]) "
+ "\n\t"
+ /* tp3 is read before it is rewritten, so it is a read-write ("+&r") operand; the store base is dst_ptr + j so each 4-pixel group lands in its own columns */
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "+&r"(tp3),
+ [tn1] "=&r"(tn1), [p1] "=&r"(p1), [p2] "=&r"(p2), [n1] "=&r"(n1),
+ [n2] "=&r"(n2), [n3] "=&r"(n3), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [p3] "=&r"(p3), [Temp3] "=&r"(Temp3),
+ [Temp4] "=&r"(Temp4)
+ : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
+ [vector4a] "r"(vector4a), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr + j),
+ [src_ptr] "r"(src_ptr));
+
+ src_ptr += 4;
+ }
+
+ /* Next row... (undo the 16 bytes stepped by the group loop) */
+ src_ptr += src_pixels_per_line - 16;
+ dst_ptr += pitch;
+ }
+ } else {
+ for (i = output_height; i--;) {
+ /* processing 4 adjacent pixels */
+ for (j = 0; j < 16; j += 4) {
+ /* apply filter with vectors pairs */
+ __asm__ __volatile__(
+ "ulw %[tp1], -1(%[src_ptr]) "
+ "\n\t"
+ "ulw %[tp2], 3(%[src_ptr]) "
+ "\n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 "
+ "\n\t"
+ "mthi $0, $ac3 "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[tp1] "
+ "\n\t"
+ "preceu.ph.qbl %[p2], %[tp1] "
+ "\n\t"
+ "preceu.ph.qbr %[p3], %[tp2] "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] "
+ "\n\t"
+
+ /* even 2. pixel */
+ "mtlo %[vector4a], $ac2 "
+ "\n\t"
+ "mthi $0, $ac2 "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector1b] "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p3], %[vector2b] "
+ "\n\t"
+ "extr.w %[Temp1], $ac3, 7 "
+ "\n\t"
+
+ /* odd 1. pixel */
+ "ulw %[tn1], 4(%[src_ptr]) "
+ "\n\t"
+ "balign %[tp2], %[tp1], 3 "
+ "\n\t"
+ "mtlo %[vector4a], $ac3 "
+ "\n\t"
+ "mthi $0, $ac3 "
+ "\n\t"
+ "preceu.ph.qbr %[n1], %[tp2] "
+ "\n\t"
+ "preceu.ph.qbl %[n2], %[tp2] "
+ "\n\t"
+ "preceu.ph.qbr %[n3], %[tn1] "
+ "\n\t"
+ "extr.w %[Temp3], $ac2, 7 "
+ "\n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector1b] "
+ "\n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector2b] "
+ "\n\t"
+
+ /* odd 2. pixel */
+ "mtlo %[vector4a], $ac2 "
+ "\n\t"
+ "mthi $0, $ac2 "
+ "\n\t"
+ "extr.w %[Temp2], $ac3, 7 "
+ "\n\t"
+ "dpa.w.ph $ac2, %[n2], %[vector1b] "
+ "\n\t"
+ "dpa.w.ph $ac2, %[n3], %[vector2b] "
+ "\n\t"
+ "extr.w %[Temp4], $ac2, 7 "
+ "\n\t"
+
+ /* clamp and store results */
+ "lbux %[tp1], %[Temp1](%[cm]) "
+ "\n\t"
+ "lbux %[tn1], %[Temp2](%[cm]) "
+ "\n\t"
+ "lbux %[tp2], %[Temp3](%[cm]) "
+ "\n\t"
+ "sb %[tp1], 0(%[output_ptr]) "
+ "\n\t"
+ "sb %[tn1], 1(%[output_ptr]) "
+ "\n\t"
+ "lbux %[n2], %[Temp4](%[cm]) "
+ "\n\t"
+ "sb %[tp2], 2(%[output_ptr]) "
+ "\n\t"
+ "sb %[n2], 3(%[output_ptr]) "
+ "\n\t"
+ /* the store base is output_ptr + j so each 4-pixel group lands in its own columns */
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
+ [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [n1] "=&r"(n1),
+ [n2] "=&r"(n2), [n3] "=&r"(n3), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4)
+ : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
+ [vector4a] "r"(vector4a), [cm] "r"(cm),
+ [output_ptr] "r"(output_ptr + j), [src_ptr] "r"(src_ptr));
+
+ src_ptr += 4;
+ }
+
+ /* next row... (undo the 16 bytes stepped by the group loop, as in the yoffset == 0 branch) */
+ src_ptr += src_pixels_per_line - 16;
+ output_ptr += output_width;
+ }
+ }
+}
+
+ /* Second filter pass for a 4-pixel-wide block, written with MIPS DSP
+  * inline assembly.
+  *
+  * Every output byte is computed from source bytes spaced 4 apart around
+  * the current position, i.e. src_ptr is read with a fixed stride of 4.
+  * The loop body produces two output rows of 4 pixels and runs twice, so
+  * 4 rows are written in total.
+  *
+  * src_ptr      - source bytes. Over the whole call the 6-tap path reads
+  *                src_ptr[-8] .. src_ptr[27] and the 4-tap path reads
+  *                src_ptr[-4] .. src_ptr[23].
+  * output_ptr   - destination; 4 bytes are stored per row.
+  * output_pitch - added to output_ptr after each stored row.
+  * yoffset      - first index into sub_pel_filterss (defined elsewhere),
+  *                selecting the coefficient words.
+  *
+  * NOTE(review): the extp instructions take their bit position from the
+  * DSPControl register (per the MIPS DSP ASE); nothing in this function
+  * sets it, so presumably the caller does - confirm.
+  */ void vp8_filter_block2d_second_pass4(unsigned char *RESTRICT src_ptr,
+ unsigned char *RESTRICT output_ptr,
+ int output_pitch, int yoffset) {
+ unsigned int i;
+
+ int Temp1, Temp2, Temp3, Temp4; /* one extracted filter sum per column */
+ unsigned int vector1b, vector2b, vector3b, vector4a;
+
+ unsigned char src_ptr_l2;
+ unsigned char src_ptr_l1;
+ unsigned char src_ptr_0;
+ unsigned char src_ptr_r1;
+ unsigned char src_ptr_r2;
+ unsigned char src_ptr_r3;
+
+ unsigned char *cm = ff_cropTbl + CROP_WIDTH; /* lookup table indexed by the sums */
+
+ vector4a = 64; /* written to every accumulator with mtlo before summing; presumably the rounding term - TODO confirm */
+
+ /* load filter coefficients
+  * vector1b is applied to the outermost tap pair (l2, r3), vector2b to
+  * the pair (0, r1), and vector3b to the pair (l1, r2), whose products
+  * are subtracted (dpsu) instead of added. */
+ vector1b = sub_pel_filterss[yoffset][0];
+ vector2b = sub_pel_filterss[yoffset][2];
+ vector3b = sub_pel_filterss[yoffset][1];
+
+ if (vector1b) {
+ /* 6 tap filter: taken when the outer coefficient word is non-zero */
+
+ for (i = 2; i--;) {
+ /* prefetch src_ptr data to cache memory */
+ prefetch_load(src_ptr);
+
+ /* .set noreorder is an assembler directive: it asks the assembler
+  * (not the compiler) to keep instructions in the order written.
+  * NOTE(review): it is not reset with .set reorder in this function. */
+ __asm__ __volatile__(
+ ".set noreorder \n\t"
+ :
+ :);
+
+ /* apply filter with vectors pairs
+  * First row of this iteration. For each of the 4 columns: load the
+  * six taps (offsets -8, -4, 0, 4, 8, 12 plus the column index),
+  * pack them pairwise with append, and accumulate into $ac2, $ac3,
+  * $ac0 and $ac1 in turn. Each sum is extracted (extp) only after
+  * the next column's loads have been issued. */
+ __asm__ __volatile__(
+ "lbu %[src_ptr_l2], -8(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -4(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 0(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 4(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 8(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 12(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -7(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -3(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 1(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 5(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 9(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 13(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "extp %[Temp1], $ac2, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -6(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -2(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 2(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 6(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 10(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 14(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac0 \n\t"
+ "extp %[Temp2], $ac3, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -5(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -1(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 3(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 7(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 11(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 15(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "extp %[Temp3], $ac0, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
+ "extp %[Temp4], $ac1, 9 \n\t"
+
+ : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
+ [Temp4] "=r"(Temp4), [src_ptr_l1] "=&r"(src_ptr_l1),
+ [src_ptr_0] "=&r"(src_ptr_0), [src_ptr_r1] "=&r"(src_ptr_r1),
+ [src_ptr_r2] "=&r"(src_ptr_r2), [src_ptr_l2] "=&r"(src_ptr_l2),
+ [src_ptr_r3] "=&r"(src_ptr_r3)
+ : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
+ [vector3b] "r"(vector3b), [vector4a] "r"(vector4a),
+ [src_ptr] "r"(src_ptr));
+
+ /* clamp and store results (each sum is used as an index into cm) */
+ output_ptr[0] = cm[Temp1];
+ output_ptr[1] = cm[Temp2];
+ output_ptr[2] = cm[Temp3];
+ output_ptr[3] = cm[Temp4];
+
+ output_ptr += output_pitch;
+
+ /* apply filter with vectors pairs
+  * Second row of this iteration: the same computation with every
+  * load offset advanced by 4 (one source row). */
+ __asm__ __volatile__(
+ "lbu %[src_ptr_l2], -4(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], 0(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 4(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 8(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 12(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 16(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -3(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], 1(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 5(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 9(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 13(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 17(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "extp %[Temp1], $ac2, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -2(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], 2(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 6(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 10(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 14(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 18(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac0 \n\t"
+ "extp %[Temp2], $ac3, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -1(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], 3(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 7(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 11(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 15(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 19(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "extp %[Temp3], $ac0, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
+ "extp %[Temp4], $ac1, 9 \n\t"
+
+ : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
+ [Temp4] "=r"(Temp4), [src_ptr_l1] "=&r"(src_ptr_l1),
+ [src_ptr_0] "=&r"(src_ptr_0), [src_ptr_r1] "=&r"(src_ptr_r1),
+ [src_ptr_r2] "=&r"(src_ptr_r2), [src_ptr_l2] "=&r"(src_ptr_l2),
+ [src_ptr_r3] "=&r"(src_ptr_r3)
+ : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
+ [vector3b] "r"(vector3b), [vector4a] "r"(vector4a),
+ [src_ptr] "r"(src_ptr));
+
+ /* clamp and store results (each sum is used as an index into cm) */
+ output_ptr[0] = cm[Temp1];
+ output_ptr[1] = cm[Temp2];
+ output_ptr[2] = cm[Temp3];
+ output_ptr[3] = cm[Temp4];
+
+ src_ptr += 8; /* two source rows of 4 bytes were consumed */
+ output_ptr += output_pitch;
+ }
+ } else {
+ /* 4 tap filter: the outermost taps (l2, r3) and vector1b are not used */
+
+ /* prefetch src_ptr data to cache memory (once, before the loop) */
+ prefetch_load(src_ptr);
+
+ for (i = 2; i--;) {
+ /* .set noreorder is an assembler directive: it asks the assembler
+  * (not the compiler) to keep instructions in the order written.
+  * NOTE(review): it is not reset with .set reorder in this function. */
+ __asm__ __volatile__(
+ ".set noreorder \n\t"
+ :
+ :);
+
+ /* apply filter with vectors pairs
+  * First row of this iteration. For each of the 4 columns: load the
+  * four taps (offsets -4, 0, 4, 8 plus the column index), pack them
+  * pairwise with append, and accumulate into $ac2, $ac3, $ac0 and
+  * $ac1 in turn. */
+ __asm__ __volatile__(
+ "lbu %[src_ptr_l1], -4(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 0(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 4(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 8(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l1], -3(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 1(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 5(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 9(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "extp %[Temp1], $ac2, 9 \n\t"
+
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l1], -2(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 2(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 6(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 10(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac0 \n\t"
+ "extp %[Temp2], $ac3, 9 \n\t"
+
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l1], -1(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 3(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 7(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 11(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "extp %[Temp3], $ac0, 9 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
+ "extp %[Temp4], $ac1, 9 \n\t"
+
+ : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
+ [Temp4] "=r"(Temp4), [src_ptr_l1] "=&r"(src_ptr_l1),
+ [src_ptr_0] "=&r"(src_ptr_0), [src_ptr_r1] "=&r"(src_ptr_r1),
+ [src_ptr_r2] "=&r"(src_ptr_r2)
+ : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b),
+ [vector4a] "r"(vector4a), [src_ptr] "r"(src_ptr));
+
+ /* clamp and store results (each sum is used as an index into cm) */
+ output_ptr[0] = cm[Temp1];
+ output_ptr[1] = cm[Temp2];
+ output_ptr[2] = cm[Temp3];
+ output_ptr[3] = cm[Temp4];
+
+ output_ptr += output_pitch;
+
+ /* apply filter with vectors pairs
+  * Second row of this iteration: the same computation with every
+  * load offset advanced by 4 (one source row). */
+ __asm__ __volatile__(
+ "lbu %[src_ptr_l1], 0(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 4(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 8(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 12(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l1], 1(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 5(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 9(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 13(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "extp %[Temp1], $ac2, 9 \n\t"
+
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l1], 2(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 6(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 10(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 14(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac0 \n\t"
+ "extp %[Temp2], $ac3, 9 \n\t"
+
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l1], 3(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 7(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 11(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 15(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "extp %[Temp3], $ac0, 9 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
+ "extp %[Temp4], $ac1, 9 \n\t"
+
+ : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
+ [Temp4] "=r"(Temp4), [src_ptr_l1] "=&r"(src_ptr_l1),
+ [src_ptr_0] "=&r"(src_ptr_0), [src_ptr_r1] "=&r"(src_ptr_r1),
+ [src_ptr_r2] "=&r"(src_ptr_r2)
+ : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b),
+ [vector4a] "r"(vector4a), [src_ptr] "r"(src_ptr));
+
+ /* clamp and store results (each sum is used as an index into cm) */
+ output_ptr[0] = cm[Temp1];
+ output_ptr[1] = cm[Temp2];
+ output_ptr[2] = cm[Temp3];
+ output_ptr[3] = cm[Temp4];
+
+ src_ptr += 8; /* two source rows of 4 bytes were consumed */
+ output_ptr += output_pitch;
+ }
+ }
+}
+
+ /* Second filter pass for an 8-pixel-wide block, written with MIPS DSP
+  * inline assembly.
+  *
+  * Every output byte is computed from source bytes spaced 8 apart around
+  * the current position, i.e. src_ptr is read with a fixed stride of 8.
+  * One row of 8 pixels is produced per loop iteration.
+  *
+  * src_ptr       - source bytes. Per output row the 6-tap path reads
+  *                 src_ptr[-16] .. src_ptr[31] and the 4-tap path reads
+  *                 src_ptr[-8] .. src_ptr[23]; src_ptr then advances by 8.
+  * output_ptr    - destination; 8 bytes are stored per row.
+  * output_pitch  - added to output_ptr after each stored row.
+  * output_height - number of rows to produce.
+  * output_width  - unused.
+  * yoffset       - first index into sub_pel_filterss (defined elsewhere),
+  *                 selecting the coefficient words.
+  *
+  * NOTE(review): the extp instructions take their bit position from the
+  * DSPControl register (per the MIPS DSP ASE); nothing in this function
+  * sets it, so presumably the caller does - confirm.
+  *
+  * NOTE(review): accumulator contents are carried from one asm statement
+  * to the next (a sum is started in one statement and extracted in the
+  * following one) and no asm statement declares clobbers. This relies on
+  * the compiler not touching $ac0..$ac3 between the statements.
+  */
+ void vp8_filter_block2d_second_pass_8(unsigned char *RESTRICT src_ptr,
+ unsigned char *RESTRICT output_ptr,
+ int output_pitch,
+ unsigned int output_height,
+ unsigned int output_width,
+ unsigned int yoffset) {
+ unsigned int i;
+
+ /* one extracted filter sum per output column */
+ int Temp1, Temp2, Temp3, Temp4, Temp5, Temp6, Temp7, Temp8;
+ unsigned int vector1b, vector2b, vector3b, vector4a;
+
+ unsigned char src_ptr_l2;
+ unsigned char src_ptr_l1;
+ unsigned char src_ptr_0;
+ unsigned char src_ptr_r1;
+ unsigned char src_ptr_r2;
+ unsigned char src_ptr_r3;
+ /* lookup table indexed by the extracted sums */
+ unsigned char *cm = ff_cropTbl + CROP_WIDTH;
+ (void)output_width;
+
+ /* written to every accumulator with mtlo before summing; presumably the
+  * rounding term - TODO confirm */
+ vector4a = 64;
+
+ /* vector1b is applied to the outermost tap pair (l2, r3), vector2b to the
+  * pair (0, r1), and vector3b to the pair (l1, r2), whose products are
+  * subtracted (dpsu) instead of added. */
+ vector1b = sub_pel_filterss[yoffset][0];
+ vector2b = sub_pel_filterss[yoffset][2];
+ vector3b = sub_pel_filterss[yoffset][1];
+
+ if (vector1b) {
+ /* 6 tap filter: taken when the outer coefficient word is non-zero */
+
+ /* prefetch src_ptr data to cache memory */
+ prefetch_load(src_ptr);
+
+ for (i = output_height; i--;) {
+ /* apply filter with vectors pairs: columns 0..3.
+  * The sum for column 3 is left in $ac1 and is extracted at the start
+  * of the next asm statement. */
+ __asm__ __volatile__(
+ "lbu %[src_ptr_l2], -16(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -8(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 0(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 8(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 16(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 24(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -15(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -7(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 1(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 9(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 17(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 25(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "extp %[Temp1], $ac2, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -14(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -6(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 2(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 10(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 18(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 26(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac0 \n\t"
+ "extp %[Temp2], $ac3, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -13(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -5(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 3(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 11(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 19(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 27(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "extp %[Temp3], $ac0, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
+
+ : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
+ [src_ptr_l1] "=&r"(src_ptr_l1), [src_ptr_0] "=&r"(src_ptr_0),
+ [src_ptr_r1] "=&r"(src_ptr_r1), [src_ptr_r2] "=&r"(src_ptr_r2),
+ [src_ptr_l2] "=&r"(src_ptr_l2), [src_ptr_r3] "=&r"(src_ptr_r3)
+ : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
+ [vector3b] "r"(vector3b), [vector4a] "r"(vector4a),
+ [src_ptr] "r"(src_ptr));
+
+ /* apply filter with vectors pairs: columns 4..7, plus the extraction
+  * of the column 3 sum (Temp4) left in $ac1 by the previous statement */
+ __asm__ __volatile__(
+ "lbu %[src_ptr_l2], -12(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -4(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 4(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 12(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 20(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 28(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"
+ "extp %[Temp4], $ac1, 9 \n\t"
+
+ "lbu %[src_ptr_l2], -11(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -3(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 5(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 13(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 21(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 29(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "extp %[Temp5], $ac2, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -10(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -2(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 6(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 14(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 22(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 30(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac0 \n\t"
+ "extp %[Temp6], $ac3, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -9(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -1(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 7(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 15(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 23(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 31(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "extp %[Temp7], $ac0, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
+ "extp %[Temp8], $ac1, 9 \n\t"
+
+ : [Temp4] "=&r"(Temp4), [Temp5] "=&r"(Temp5), [Temp6] "=&r"(Temp6),
+ [Temp7] "=&r"(Temp7), [Temp8] "=r"(Temp8),
+ [src_ptr_l1] "=&r"(src_ptr_l1), [src_ptr_0] "=&r"(src_ptr_0),
+ [src_ptr_r1] "=&r"(src_ptr_r1), [src_ptr_r2] "=&r"(src_ptr_r2),
+ [src_ptr_l2] "=&r"(src_ptr_l2), [src_ptr_r3] "=&r"(src_ptr_r3)
+ : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
+ [vector3b] "r"(vector3b), [vector4a] "r"(vector4a),
+ [src_ptr] "r"(src_ptr));
+
+ /* clamp and store results (each sum is used as an index into cm) */
+ output_ptr[0] = cm[Temp1];
+ output_ptr[1] = cm[Temp2];
+ output_ptr[2] = cm[Temp3];
+ output_ptr[3] = cm[Temp4];
+ output_ptr[4] = cm[Temp5];
+ output_ptr[5] = cm[Temp6];
+ output_ptr[6] = cm[Temp7];
+ output_ptr[7] = cm[Temp8];
+
+ src_ptr += 8; /* one source row of 8 bytes was consumed */
+ output_ptr += output_pitch;
+ }
+ } else {
+ /* 4 tap filter: the outermost taps (l2, r3) and vector1b are not used.
+  *
+  * Each asm statement below starts the sum for one column in one
+  * accumulator and extracts the previous column's sum from another. */
+
+ /* prefetch src_ptr data to cache memory */
+ prefetch_load(src_ptr);
+
+ for (i = output_height; i--;) {
+ /* column 0: taps loaded inside the asm, sum left in $ac2 */
+ __asm__ __volatile__(
+ "lbu %[src_ptr_l1], -8(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 0(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 8(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 16(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"
+
+ : [src_ptr_l1] "=&r"(src_ptr_l1), [src_ptr_0] "=&r"(src_ptr_0),
+ [src_ptr_r1] "=&r"(src_ptr_r1), [src_ptr_r2] "=&r"(src_ptr_r2)
+ : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b),
+ [vector4a] "r"(vector4a), [src_ptr] "r"(src_ptr));
+
+ /* column 1 into $ac3; extract column 0 from $ac2 */
+ __asm__ __volatile__(
+ "lbu %[src_ptr_l1], -7(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 1(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 9(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 17(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
+ "extp %[Temp1], $ac2, 9 \n\t"
+
+ : [Temp1] "=r"(Temp1), [src_ptr_l1] "=&r"(src_ptr_l1),
+ [src_ptr_0] "=&r"(src_ptr_0), [src_ptr_r1] "=&r"(src_ptr_r1),
+ [src_ptr_r2] "=&r"(src_ptr_r2)
+ : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b),
+ [vector4a] "r"(vector4a), [src_ptr] "r"(src_ptr));
+
+ /* Columns 2..7 load their taps in C and hand them to the asm.
+  * The append instructions overwrite src_ptr_0 and src_ptr_l1, so those
+  * two are declared as read-write ("+r") operands: GCC does not allow
+  * an asm statement to modify an input-only operand. */
+ src_ptr_l1 = src_ptr[-6];
+ src_ptr_0 = src_ptr[2];
+ src_ptr_r1 = src_ptr[10];
+ src_ptr_r2 = src_ptr[18];
+
+ /* column 2 into $ac0; extract column 1 from $ac3 */
+ __asm__ __volatile__(
+ "mtlo %[vector4a], $ac0 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"
+ "extp %[Temp2], $ac3, 9 \n\t"
+
+ : [Temp2] "=r"(Temp2), [src_ptr_l1] "+r"(src_ptr_l1),
+ [src_ptr_0] "+r"(src_ptr_0)
+ : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b),
+ [src_ptr_r1] "r"(src_ptr_r1), [src_ptr_r2] "r"(src_ptr_r2),
+ [vector4a] "r"(vector4a));
+
+ src_ptr_l1 = src_ptr[-5];
+ src_ptr_0 = src_ptr[3];
+ src_ptr_r1 = src_ptr[11];
+ src_ptr_r2 = src_ptr[19];
+
+ /* column 3 into $ac1; extract column 2 from $ac0 */
+ __asm__ __volatile__(
+ "mtlo %[vector4a], $ac1 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
+ "extp %[Temp3], $ac0, 9 \n\t"
+
+ : [Temp3] "=r"(Temp3), [src_ptr_l1] "+r"(src_ptr_l1),
+ [src_ptr_0] "+r"(src_ptr_0)
+ : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b),
+ [src_ptr_r1] "r"(src_ptr_r1), [src_ptr_r2] "r"(src_ptr_r2),
+ [vector4a] "r"(vector4a));
+
+ src_ptr_l1 = src_ptr[-4];
+ src_ptr_0 = src_ptr[4];
+ src_ptr_r1 = src_ptr[12];
+ src_ptr_r2 = src_ptr[20];
+
+ /* column 4 into $ac2; extract column 3 from $ac1 */
+ __asm__ __volatile__(
+ "mtlo %[vector4a], $ac2 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"
+ "extp %[Temp4], $ac1, 9 \n\t"
+
+ : [Temp4] "=r"(Temp4), [src_ptr_l1] "+r"(src_ptr_l1),
+ [src_ptr_0] "+r"(src_ptr_0)
+ : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b),
+ [src_ptr_r1] "r"(src_ptr_r1), [src_ptr_r2] "r"(src_ptr_r2),
+ [vector4a] "r"(vector4a));
+
+ src_ptr_l1 = src_ptr[-3];
+ src_ptr_0 = src_ptr[5];
+ src_ptr_r1 = src_ptr[13];
+ src_ptr_r2 = src_ptr[21];
+
+ /* column 5 into $ac3; extract column 4 from $ac2 */
+ __asm__ __volatile__(
+ "mtlo %[vector4a], $ac3 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
+ "extp %[Temp5], $ac2, 9 \n\t"
+
+ : [Temp5] "=&r"(Temp5), [src_ptr_l1] "+r"(src_ptr_l1),
+ [src_ptr_0] "+r"(src_ptr_0)
+ : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b),
+ [src_ptr_r1] "r"(src_ptr_r1), [src_ptr_r2] "r"(src_ptr_r2),
+ [vector4a] "r"(vector4a));
+
+ src_ptr_l1 = src_ptr[-2];
+ src_ptr_0 = src_ptr[6];
+ src_ptr_r1 = src_ptr[14];
+ src_ptr_r2 = src_ptr[22];
+
+ /* column 6 into $ac0; extract column 5 from $ac3 */
+ __asm__ __volatile__(
+ "mtlo %[vector4a], $ac0 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"
+ "extp %[Temp6], $ac3, 9 \n\t"
+
+ : [Temp6] "=r"(Temp6), [src_ptr_l1] "+r"(src_ptr_l1),
+ [src_ptr_0] "+r"(src_ptr_0)
+ : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b),
+ [src_ptr_r1] "r"(src_ptr_r1), [src_ptr_r2] "r"(src_ptr_r2),
+ [vector4a] "r"(vector4a));
+
+ src_ptr_l1 = src_ptr[-1];
+ src_ptr_0 = src_ptr[7];
+ src_ptr_r1 = src_ptr[15];
+ src_ptr_r2 = src_ptr[23];
+
+ /* column 7 into $ac1; extract column 6 from $ac0, then column 7 */
+ __asm__ __volatile__(
+ "mtlo %[vector4a], $ac1 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
+ "extp %[Temp7], $ac0, 9 \n\t"
+ "extp %[Temp8], $ac1, 9 \n\t"
+
+ : [Temp7] "=&r"(Temp7), [Temp8] "=r"(Temp8),
+ [src_ptr_l1] "+r"(src_ptr_l1), [src_ptr_0] "+r"(src_ptr_0)
+ : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b),
+ [src_ptr_r1] "r"(src_ptr_r1), [src_ptr_r2] "r"(src_ptr_r2),
+ [vector4a] "r"(vector4a));
+
+ /* clamp and store results (each sum is used as an index into cm) */
+ output_ptr[0] = cm[Temp1];
+ output_ptr[1] = cm[Temp2];
+ output_ptr[2] = cm[Temp3];
+ output_ptr[3] = cm[Temp4];
+ output_ptr[4] = cm[Temp5];
+ output_ptr[5] = cm[Temp6];
+ output_ptr[6] = cm[Temp7];
+ output_ptr[7] = cm[Temp8];
+
+ src_ptr += 8; /* one source row of 8 bytes was consumed */
+ output_ptr += output_pitch;
+ }
+ }
+}
+
+void vp8_filter_block2d_second_pass161(unsigned char *RESTRICT src_ptr,
+ unsigned char *RESTRICT output_ptr,
+ int output_pitch,
+ const unsigned short *vp8_filter) {
+ unsigned int i, j;
+
+ int Temp1, Temp2, Temp3, Temp4, Temp5, Temp6, Temp7, Temp8;
+ unsigned int vector4a;
+ unsigned int vector1b, vector2b, vector3b;
+
+ unsigned char src_ptr_l2;
+ unsigned char src_ptr_l1;
+ unsigned char src_ptr_0;
+ unsigned char src_ptr_r1;
+ unsigned char src_ptr_r2;
+ unsigned char src_ptr_r3;
+ unsigned char *cm = ff_cropTbl + CROP_WIDTH;
+
+ vector4a = 64;
+
+ vector1b = vp8_filter[0];
+ vector2b = vp8_filter[2];
+ vector3b = vp8_filter[1];
+
+ if (vector1b == 0) {
+ /* 4 tap filter */
+
+ /* prefetch src_ptr data to cache memory */
+ prefetch_load(src_ptr + 16);
+
+ for (i = 16; i--;) {
+ /* unrolling for loop */
+ for (j = 0; j < 16; j += 8) {
+ /* apply filter with vectors pairs */
+ __asm__ __volatile__(
+ "lbu %[src_ptr_l1], -16(%[src_ptr]) "
+ "\n\t"
+ "lbu %[src_ptr_0], 0(%[src_ptr]) "
+ "\n\t"
+ "lbu %[src_ptr_r1], 16(%[src_ptr]) "
+ "\n\t"
+ "lbu %[src_ptr_r2], 32(%[src_ptr]) "
+ "\n\t"
+ "mtlo %[vector4a], $ac2 "
+ "\n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 "
+ "\n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 "
+ "\n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] "
+ "\n\t"
+ "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] "
+ "\n\t"
+
+ "lbu %[src_ptr_l1], -15(%[src_ptr]) "
+ "\n\t"
+ "lbu %[src_ptr_0], 1(%[src_ptr]) "
+ "\n\t"
+ "lbu %[src_ptr_r1], 17(%[src_ptr]) "
+ "\n\t"
+ "lbu %[src_ptr_r2], 33(%[src_ptr]) "
+ "\n\t"
+ "mtlo %[vector4a], $ac3 "
+ "\n\t"
+ "extp %[Temp1], $ac2, 9 "
+ "\n\t"
+
+ "append %[src_ptr_0], %[src_ptr_r1], 8 "
+ "\n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 "
+ "\n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] "
+ "\n\t"
+ "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] "
+ "\n\t"
+
+ "lbu %[src_ptr_l1], -14(%[src_ptr]) "
+ "\n\t"
+ "lbu %[src_ptr_0], 2(%[src_ptr]) "
+ "\n\t"
+ "lbu %[src_ptr_r1], 18(%[src_ptr]) "
+ "\n\t"
+ "lbu %[src_ptr_r2], 34(%[src_ptr]) "
+ "\n\t"
+ "mtlo %[vector4a], $ac1 "
+ "\n\t"
+ "extp %[Temp2], $ac3, 9 "
+ "\n\t"
+
+ "append %[src_ptr_0], %[src_ptr_r1], 8 "
+ "\n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 "
+ "\n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] "
+ "\n\t"
+ "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] "
+ "\n\t"
+
+ "lbu %[src_ptr_l1], -13(%[src_ptr]) "
+ "\n\t"
+ "lbu %[src_ptr_0], 3(%[src_ptr]) "
+ "\n\t"
+ "lbu %[src_ptr_r1], 19(%[src_ptr]) "
+ "\n\t"
+ "lbu %[src_ptr_r2], 35(%[src_ptr]) "
+ "\n\t"
+ "mtlo %[vector4a], $ac3 "
+ "\n\t"
+ "extp %[Temp3], $ac1, 9 "
+ "\n\t"
+
+ "append %[src_ptr_0], %[src_ptr_r1], 8 "
+ "\n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 "
+ "\n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] "
+ "\n\t"
+ "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] "
+ "\n\t"
+
+ "lbu %[src_ptr_l1], -12(%[src_ptr]) "
+ "\n\t"
+ "lbu %[src_ptr_0], 4(%[src_ptr]) "
+ "\n\t"
+ "lbu %[src_ptr_r1], 20(%[src_ptr]) "
+ "\n\t"
+ "lbu %[src_ptr_r2], 36(%[src_ptr]) "
+ "\n\t"
+ "mtlo %[vector4a], $ac2 "
+ "\n\t"
+ "extp %[Temp4], $ac3, 9 "
+ "\n\t"
+
+ "append %[src_ptr_0], %[src_ptr_r1], 8 "
+ "\n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 "
+ "\n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] "
+ "\n\t"
+ "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] "
+ "\n\t"
+
+ "lbu %[src_ptr_l1], -11(%[src_ptr]) "
+ "\n\t"
+ "lbu %[src_ptr_0], 5(%[src_ptr]) "
+ "\n\t"
+ "lbu %[src_ptr_r1], 21(%[src_ptr]) "
+ "\n\t"
+ "lbu %[src_ptr_r2], 37(%[src_ptr]) "
+ "\n\t"
+ "mtlo %[vector4a], $ac3 "
+ "\n\t"
+ "extp %[Temp5], $ac2, 9 "
+ "\n\t"
+
+ "append %[src_ptr_0], %[src_ptr_r1], 8 "
+ "\n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 "
+ "\n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] "
+ "\n\t"
+ "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] "
+ "\n\t"
+
+ "lbu %[src_ptr_l1], -10(%[src_ptr]) "
+ "\n\t"
+ "lbu %[src_ptr_0], 6(%[src_ptr]) "
+ "\n\t"
+ "lbu %[src_ptr_r1], 22(%[src_ptr]) "
+ "\n\t"
+ "lbu %[src_ptr_r2], 38(%[src_ptr]) "
+ "\n\t"
+ "mtlo %[vector4a], $ac1 "
+ "\n\t"
+ "extp %[Temp6], $ac3, 9 "
+ "\n\t"
+
+ "append %[src_ptr_0], %[src_ptr_r1], 8 "
+ "\n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 "
+ "\n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] "
+ "\n\t"
+ "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] "
+ "\n\t"
+
+ "lbu %[src_ptr_l1], -9(%[src_ptr]) "
+ "\n\t"
+ "lbu %[src_ptr_0], 7(%[src_ptr]) "
+ "\n\t"
+ "lbu %[src_ptr_r1], 23(%[src_ptr]) "
+ "\n\t"
+ "lbu %[src_ptr_r2], 39(%[src_ptr]) "
+ "\n\t"
+ "mtlo %[vector4a], $ac3 "
+ "\n\t"
+ "extp %[Temp7], $ac1, 9 "
+ "\n\t"
+
+ "append %[src_ptr_0], %[src_ptr_r1], 8 "
+ "\n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 "
+ "\n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] "
+ "\n\t"
+ "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] "
+ "\n\t"
+ "extp %[Temp8], $ac3, 9 "
+ "\n\t"
+
+ : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
+ [Temp4] "=&r"(Temp4), [Temp5] "=&r"(Temp5), [Temp6] "=&r"(Temp6),
+ [Temp7] "=&r"(Temp7), [Temp8] "=r"(Temp8),
+ [src_ptr_l1] "=&r"(src_ptr_l1), [src_ptr_0] "=&r"(src_ptr_0),
+ [src_ptr_r1] "=&r"(src_ptr_r1), [src_ptr_r2] "=&r"(src_ptr_r2)
+ : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b),
+ [vector4a] "r"(vector4a), [src_ptr] "r"(src_ptr));
+
+ /* clamp and store results */
+ output_ptr[j] = cm[Temp1];
+ output_ptr[j + 1] = cm[Temp2];
+ output_ptr[j + 2] = cm[Temp3];
+ output_ptr[j + 3] = cm[Temp4];
+ output_ptr[j + 4] = cm[Temp5];
+ output_ptr[j + 5] = cm[Temp6];
+ output_ptr[j + 6] = cm[Temp7];
+ output_ptr[j + 7] = cm[Temp8];
+
+ src_ptr += 8;
+ }
+
+ output_ptr += output_pitch;
+ }
+ } else {
+ /* 4 tap filter */
+
+ /* prefetch src_ptr data to cache memory */
+ prefetch_load(src_ptr + 16);
+
+ /* unroll for loop */
+ for (i = 16; i--;) {
+ /* apply filter with vectors pairs */
+ __asm__ __volatile__(
+ "lbu %[src_ptr_l2], -32(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -16(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 0(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 16(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 32(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 48(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -31(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -15(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 1(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 17(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 33(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 49(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac0 \n\t"
+ "extp %[Temp1], $ac2, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -30(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -14(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 2(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 18(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 34(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 50(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "extp %[Temp2], $ac0, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -29(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -13(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 3(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 19(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 35(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 51(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "extp %[Temp3], $ac1, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -28(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -12(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 4(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 20(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 36(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 52(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "extp %[Temp4], $ac3, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -27(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -11(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 5(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 21(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 37(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 53(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac0 \n\t"
+ "extp %[Temp5], $ac2, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -26(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -10(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 6(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 22(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 38(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 54(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "extp %[Temp6], $ac0, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -25(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -9(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 7(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 23(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 39(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 55(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "extp %[Temp7], $ac1, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
+ "extp %[Temp8], $ac3, 9 \n\t"
+
+ : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
+ [Temp4] "=&r"(Temp4), [Temp5] "=&r"(Temp5), [Temp6] "=&r"(Temp6),
+ [Temp7] "=&r"(Temp7), [Temp8] "=r"(Temp8),
+ [src_ptr_l1] "=&r"(src_ptr_l1), [src_ptr_0] "=&r"(src_ptr_0),
+ [src_ptr_r1] "=&r"(src_ptr_r1), [src_ptr_r2] "=&r"(src_ptr_r2),
+ [src_ptr_l2] "=&r"(src_ptr_l2), [src_ptr_r3] "=&r"(src_ptr_r3)
+ : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
+ [vector3b] "r"(vector3b), [vector4a] "r"(vector4a),
+ [src_ptr] "r"(src_ptr));
+
+ /* clamp and store results */
+ output_ptr[0] = cm[Temp1];
+ output_ptr[1] = cm[Temp2];
+ output_ptr[2] = cm[Temp3];
+ output_ptr[3] = cm[Temp4];
+ output_ptr[4] = cm[Temp5];
+ output_ptr[5] = cm[Temp6];
+ output_ptr[6] = cm[Temp7];
+ output_ptr[7] = cm[Temp8];
+
+ /* apply filter with vectors pairs */
+ __asm__ __volatile__(
+ "lbu %[src_ptr_l2], -24(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -8(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 8(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 24(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 40(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 56(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -23(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -7(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 9(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 25(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 41(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 57(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac0 \n\t"
+ "extp %[Temp1], $ac2, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -22(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -6(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 10(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 26(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 42(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 58(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "extp %[Temp2], $ac0, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -21(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -5(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 11(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 27(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 43(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 59(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "extp %[Temp3], $ac1, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -20(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -4(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 12(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 28(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 44(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 60(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "extp %[Temp4], $ac3, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -19(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -3(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 13(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 29(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 45(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 61(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac0 \n\t"
+ "extp %[Temp5], $ac2, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -18(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -2(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 14(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 30(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 46(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 62(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "extp %[Temp6], $ac0, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -17(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -1(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 15(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 31(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 47(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 63(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "extp %[Temp7], $ac1, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
+ "extp %[Temp8], $ac3, 9 \n\t"
+
+ : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
+ [Temp4] "=&r"(Temp4), [Temp5] "=&r"(Temp5), [Temp6] "=&r"(Temp6),
+ [Temp7] "=&r"(Temp7), [Temp8] "=r"(Temp8),
+ [src_ptr_l1] "=&r"(src_ptr_l1), [src_ptr_0] "=&r"(src_ptr_0),
+ [src_ptr_r1] "=&r"(src_ptr_r1), [src_ptr_r2] "=&r"(src_ptr_r2),
+ [src_ptr_l2] "=&r"(src_ptr_l2), [src_ptr_r3] "=&r"(src_ptr_r3)
+ : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
+ [vector3b] "r"(vector3b), [vector4a] "r"(vector4a),
+ [src_ptr] "r"(src_ptr));
+
+ src_ptr += 16;
+ output_ptr[8] = cm[Temp1];
+ output_ptr[9] = cm[Temp2];
+ output_ptr[10] = cm[Temp3];
+ output_ptr[11] = cm[Temp4];
+ output_ptr[12] = cm[Temp5];
+ output_ptr[13] = cm[Temp6];
+ output_ptr[14] = cm[Temp7];
+ output_ptr[15] = cm[Temp8];
+
+ output_ptr += output_pitch;
+ }
+ }
+}
+
+/* Sub-pixel prediction of a 4x4 block.
+ *
+ * The horizontal pass (vp8_filter_block2d_first_pass_4) is always run with
+ * xoffset.  When yoffset is non-zero its output goes to a temporary buffer
+ * and vp8_filter_block2d_second_pass4 produces dst_ptr from it; when yoffset
+ * is zero the horizontal pass writes dst_ptr directly.
+ */
+void vp8_sixtap_predict4x4_dspr2(unsigned char *RESTRICT src_ptr,
+                                 int src_pixels_per_line, int xoffset,
+                                 int yoffset, unsigned char *RESTRICT dst_ptr,
+                                 int dst_pitch) {
+  unsigned char FData[9 * 4]; /* intermediate buffer between the two passes */
+  unsigned int pos = 16;
+
+  /* bit position used when extracting results from the accumulators */
+  __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+                       :
+                       : [pos] "r"(pos));
+
+  if (!yoffset) {
+    /* no vertical filtering: the first pass stores straight to dst_ptr */
+    vp8_filter_block2d_first_pass_4(src_ptr, dst_ptr, src_pixels_per_line, 4,
+                                    xoffset, dst_pitch);
+    return;
+  }
+
+  /* horizontal pass, starting two source rows above the block */
+  vp8_filter_block2d_first_pass_4(src_ptr - (2 * src_pixels_per_line), FData,
+                                  src_pixels_per_line, 9, xoffset, 4);
+
+  /* vertical pass, reading the intermediate buffer from byte offset 8 */
+  vp8_filter_block2d_second_pass4(FData + 8, dst_ptr, dst_pitch, yoffset);
+}
+
+/* Sub-pixel prediction of an 8x8 block.
+ *
+ * yoffset != 0: 13 rows, starting two rows above the block, are written to
+ *   FData (filtered horizontally when xoffset != 0, copied unchanged when
+ *   xoffset == 0) and vp8_filter_block2d_second_pass_8 then produces dst_ptr.
+ * yoffset == 0: the horizontal pass writes dst_ptr directly, or the block is
+ *   copied unchanged when xoffset is also 0.
+ *
+ * The xoffset == 0 paths used to be inline-asm word copies.  That asm
+ * advanced src_ptr although it was declared as an input-only operand, stored
+ * to memory without a "memory" clobber, used aligned word stores on byte
+ * buffers, and laid the destination rows out 8 bytes apart instead of
+ * dst_pitch bytes apart.  Plain row copies avoid all of these.
+ */
+void vp8_sixtap_predict8x8_dspr2(unsigned char *RESTRICT src_ptr,
+                                 int src_pixels_per_line, int xoffset,
+                                 int yoffset, unsigned char *RESTRICT dst_ptr,
+                                 int dst_pitch) {
+  unsigned char FData[13 * 8]; /* Temp data buffer used in filtering */
+  unsigned int pos;
+  int r, c;
+
+  pos = 16;
+
+  /* bit position for extract from acc */
+  __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+                       :
+                       : [pos] "r"(pos));
+
+  if (yoffset) {
+    src_ptr = src_ptr - (2 * src_pixels_per_line);
+
+    if (xoffset) {
+      /* filter 1-D horizontally... */
+      vp8_filter_block2d_first_pass_8_all(src_ptr, FData, src_pixels_per_line,
+                                          13, xoffset, 8);
+    } else {
+      /* prefetch src_ptr data to cache memory */
+      prefetch_load(src_ptr + 2 * src_pixels_per_line);
+
+      /* no horizontal filtering: copy 13 source rows into FData, 8 bytes
+       * per row */
+      for (r = 0; r < 13; ++r) {
+        for (c = 0; c < 8; ++c) {
+          FData[r * 8 + c] = src_ptr[r * src_pixels_per_line + c];
+        }
+      }
+    }
+
+    /* filter vertically... */
+    vp8_filter_block2d_second_pass_8(FData + 16, dst_ptr, dst_pitch, 8, 8,
+                                     yoffset);
+  } else if (xoffset) {
+    /* yoffset == 0: the first pass saves its data directly to dst_ptr */
+    vp8_filter_block2d_first_pass_8_all(src_ptr, dst_ptr, src_pixels_per_line,
+                                        8, xoffset, dst_pitch);
+  } else {
+    /* both offsets are 0: copy from src buffer to dst buffer, honouring
+     * dst_pitch like the filtered path does */
+    for (r = 0; r < 8; ++r) {
+      for (c = 0; c < 8; ++c) {
+        dst_ptr[r * dst_pitch + c] = src_ptr[r * src_pixels_per_line + c];
+      }
+    }
+  }
+}
+
+/* Sub-pixel prediction of an 8-wide, 4-high block.
+ *
+ * yoffset != 0: 9 rows, starting two rows above the block, are written to
+ *   FData (filtered horizontally when xoffset != 0, copied unchanged when
+ *   xoffset == 0) and vp8_filter_block2d_second_pass_8 then produces dst_ptr.
+ * yoffset == 0: the horizontal pass writes dst_ptr directly, or the block is
+ *   copied unchanged when xoffset is also 0.
+ *
+ * The xoffset == 0 paths used to be inline-asm word copies.  That asm
+ * advanced src_ptr although it was declared as an input-only operand, stored
+ * to memory without a "memory" clobber, used aligned word stores on byte
+ * buffers, and laid the destination rows out 8 bytes apart instead of
+ * dst_pitch bytes apart.  Plain row copies avoid all of these.
+ */
+void vp8_sixtap_predict8x4_dspr2(unsigned char *RESTRICT src_ptr,
+                                 int src_pixels_per_line, int xoffset,
+                                 int yoffset, unsigned char *RESTRICT dst_ptr,
+                                 int dst_pitch) {
+  unsigned char FData[9 * 8]; /* Temp data buffer used in filtering */
+  unsigned int pos;
+  int r, c;
+
+  pos = 16;
+
+  /* bit position for extract from acc */
+  __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+                       :
+                       : [pos] "r"(pos));
+
+  if (yoffset) {
+    src_ptr = src_ptr - (2 * src_pixels_per_line);
+
+    if (xoffset) {
+      /* filter 1-D horizontally... */
+      vp8_filter_block2d_first_pass_8_all(src_ptr, FData, src_pixels_per_line,
+                                          9, xoffset, 8);
+    } else {
+      /* prefetch src_ptr data to cache memory */
+      prefetch_load(src_ptr + 2 * src_pixels_per_line);
+
+      /* no horizontal filtering: copy 9 source rows into FData, 8 bytes
+       * per row */
+      for (r = 0; r < 9; ++r) {
+        for (c = 0; c < 8; ++c) {
+          FData[r * 8 + c] = src_ptr[r * src_pixels_per_line + c];
+        }
+      }
+    }
+
+    /* filter vertically... */
+    vp8_filter_block2d_second_pass_8(FData + 16, dst_ptr, dst_pitch, 4, 8,
+                                     yoffset);
+  } else if (xoffset) {
+    /* yoffset == 0: the first pass saves its data directly to dst_ptr */
+    vp8_filter_block2d_first_pass_8_all(src_ptr, dst_ptr, src_pixels_per_line,
+                                        4, xoffset, dst_pitch);
+  } else {
+    /* both offsets are 0: copy from src buffer to dst buffer, honouring
+     * dst_pitch like the filtered path does */
+    for (r = 0; r < 4; ++r) {
+      for (c = 0; c < 8; ++c) {
+        dst_ptr[r * dst_pitch + c] = src_ptr[r * src_pixels_per_line + c];
+      }
+    }
+  }
+}
+
+/* Sub-pixel prediction of a 16x16 block.
+ *
+ * xoffset selects the horizontal first pass: even non-zero values use the
+ * 6-tap routine, odd values the 4-tap routine, and 0 means no horizontal
+ * filtering.  When yoffset is non-zero the first pass fills FData, starting
+ * two rows above the block, and vp8_filter_block2d_second_pass161 filters it
+ * vertically with the taps in sub_pel_filterss[yoffset].  When yoffset is
+ * zero the first pass writes dst_ptr directly.
+ *
+ * The xoffset == 0 / yoffset == 0 combination previously matched no case and
+ * left dst_ptr unwritten; it now copies the block.
+ */
+void vp8_sixtap_predict16x16_dspr2(unsigned char *RESTRICT src_ptr,
+                                   int src_pixels_per_line, int xoffset,
+                                   int yoffset, unsigned char *RESTRICT dst_ptr,
+                                   int dst_pitch) {
+  const unsigned short *VFilter;
+  unsigned char FData[21 * 16]; /* Temp data buffer used in filtering */
+  unsigned int pos;
+  int r, c;
+
+  VFilter = sub_pel_filterss[yoffset];
+
+  pos = 16;
+
+  /* bit position for extract from acc */
+  __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+                       :
+                       : [pos] "r"(pos));
+
+  if (yoffset) {
+    src_ptr = src_ptr - (2 * src_pixels_per_line);
+
+    switch (xoffset) {
+      /* filter 1-D horizontally... */
+      case 2:
+      case 4:
+      case 6:
+        /* 6 tap filter */
+        vp8_filter_block2d_first_pass16_6tap(
+            src_ptr, FData, src_pixels_per_line, 21, xoffset, 16);
+        break;
+
+      case 0:
+        /* only copy buffer */
+        vp8_filter_block2d_first_pass16_0(src_ptr, FData, src_pixels_per_line);
+        break;
+
+      case 1:
+      case 3:
+      case 5:
+      case 7:
+        /* 4 tap filter */
+        vp8_filter_block2d_first_pass16_4tap(
+            src_ptr, FData, src_pixels_per_line, 16, 21, xoffset, yoffset,
+            dst_ptr, dst_pitch);
+        break;
+
+      default:
+        /* xoffset outside 0..7 is not handled by any first pass */
+        break;
+    }
+
+    /* filter vertically... */
+    vp8_filter_block2d_second_pass161(FData + 32, dst_ptr, dst_pitch, VFilter);
+  } else {
+    /* yoffset == 0: the first pass saves its data directly to dst_ptr */
+    switch (xoffset) {
+      case 2:
+      case 4:
+      case 6:
+        /* 6 tap filter */
+        vp8_filter_block2d_first_pass16_6tap(
+            src_ptr, dst_ptr, src_pixels_per_line, 16, xoffset, dst_pitch);
+        break;
+
+      case 1:
+      case 3:
+      case 5:
+      case 7:
+        /* 4 tap filter */
+        vp8_filter_block2d_first_pass16_4tap(
+            src_ptr, dst_ptr, src_pixels_per_line, 16, 21, xoffset, yoffset,
+            dst_ptr, dst_pitch);
+        break;
+
+      case 0:
+        /* neither direction is filtered: copy the block so that dst_ptr is
+         * always written, matching the "only copy" handling of xoffset == 0
+         * in the other branch */
+        for (r = 0; r < 16; ++r) {
+          for (c = 0; c < 16; ++c) {
+            dst_ptr[r * dst_pitch + c] = src_ptr[r * src_pixels_per_line + c];
+          }
+        }
+        break;
+
+      default:
+        /* xoffset outside 0..7 is not handled by any first pass */
+        break;
+    }
+  }
+}
+
+#endif
diff --git a/media/libvpx/libvpx/vp8/common/mips/dspr2/idct_blk_dspr2.c b/media/libvpx/libvpx/vp8/common/mips/dspr2/idct_blk_dspr2.c
new file mode 100644
index 0000000000..eae852d592
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/mips/dspr2/idct_blk_dspr2.c
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+
+#if HAVE_DSPR2
+
+/* Process a 4x4 grid of 4x4 blocks.
+ *
+ * q      coefficients, 16 shorts per block, advanced block by block
+ * dq     dequantization factors passed to the per-block routines
+ * dst    destination, advanced 4 bytes per block and 4 rows per block row
+ * stride byte distance between destination rows
+ * eobs   one entry per block; a value > 1 selects vp8_dequant_idct_add_dspr2,
+ *        otherwise only the DC term q[0] * dq[0] is applied
+ */
+void vp8_dequant_idct_add_y_block_dspr2(short *q, short *dq, unsigned char *dst,
+                                        int stride, char *eobs) {
+  int i, j;
+
+  for (i = 0; i < 4; ++i) {
+    for (j = 0; j < 4; ++j) {
+      if (*eobs++ > 1) {
+        vp8_dequant_idct_add_dspr2(q, dq, dst, stride);
+      } else {
+        vp8_dc_only_idct_add_dspr2(q[0] * dq[0], dst, stride, dst, stride);
+        /* clear the first two coefficients; done as two short stores instead
+         * of one store through an int pointer, which broke strict aliasing */
+        q[0] = 0;
+        q[1] = 0;
+      }
+
+      q += 16;
+      dst += 4;
+    }
+
+    /* back to the left edge (4 blocks * 4 bytes), down four rows */
+    dst += 4 * stride - 16;
+  }
+}
+
+/* Process two 2x2 grids of 4x4 blocks, first into dst_u and then into dst_v.
+ *
+ * q      coefficients, 16 shorts per block; the dst_v blocks follow the
+ *        dst_u blocks
+ * dq     dequantization factors passed to the per-block routines
+ * stride byte distance between rows, shared by both destinations
+ * eobs   one entry per block; a value > 1 selects vp8_dequant_idct_add_dspr2,
+ *        otherwise only the DC term q[0] * dq[0] is applied
+ */
+void vp8_dequant_idct_add_uv_block_dspr2(short *q, short *dq,
+                                         unsigned char *dst_u,
+                                         unsigned char *dst_v, int stride,
+                                         char *eobs) {
+  int i, j;
+
+  for (i = 0; i < 2; ++i) {
+    for (j = 0; j < 2; ++j) {
+      if (*eobs++ > 1) {
+        vp8_dequant_idct_add_dspr2(q, dq, dst_u, stride);
+      } else {
+        vp8_dc_only_idct_add_dspr2(q[0] * dq[0], dst_u, stride, dst_u, stride);
+        /* clear the first two coefficients; done as two short stores instead
+         * of one store through an int pointer, which broke strict aliasing */
+        q[0] = 0;
+        q[1] = 0;
+      }
+
+      q += 16;
+      dst_u += 4;
+    }
+
+    /* back to the left edge (2 blocks * 4 bytes), down four rows */
+    dst_u += 4 * stride - 8;
+  }
+
+  for (i = 0; i < 2; ++i) {
+    for (j = 0; j < 2; ++j) {
+      if (*eobs++ > 1) {
+        vp8_dequant_idct_add_dspr2(q, dq, dst_v, stride);
+      } else {
+        vp8_dc_only_idct_add_dspr2(q[0] * dq[0], dst_v, stride, dst_v, stride);
+        /* same two-coefficient clear as for the dst_u blocks */
+        q[0] = 0;
+        q[1] = 0;
+      }
+
+      q += 16;
+      dst_v += 4;
+    }
+
+    /* back to the left edge (2 blocks * 4 bytes), down four rows */
+    dst_v += 4 * stride - 8;
+  }
+}
+
+#endif
diff --git a/media/libvpx/libvpx/vp8/common/mips/dspr2/idctllm_dspr2.c b/media/libvpx/libvpx/vp8/common/mips/dspr2/idctllm_dspr2.c
new file mode 100644
index 0000000000..9163ffad1e
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/mips/dspr2/idctllm_dspr2.c
@@ -0,0 +1,346 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp8_rtcd.h"
+
+#if HAVE_DSPR2
+#define CROP_WIDTH 256
+
+/******************************************************************************
+ * Notes:
+ *
+ * This implementation makes use of 16 bit fixed point version of two multiply
+ * constants:
+ * 1. sqrt(2) * cos (pi/8)
+ * 2. sqrt(2) * sin (pi/8)
+ * Since the first constant is bigger than 1, to maintain the same 16 bit
+ * fixed point precision as the second one, we use a trick of
+ * x * a = x + x*(a-1)
+ * so
+ * x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
+ ****************************************************************************/
+extern unsigned char ff_cropTbl[256 + 2 * CROP_WIDTH];
+static const int cospi8sqrt2minus1 = 20091;
+static const int sinpi8sqrt2 = 35468;
+
+/* Issue a "pref 0" cache prefetch hint for the address in src.
+ *
+ * Declared static: a plain `inline` definition with no other declaration
+ * provides no external definition under C99 rules, so any call the compiler
+ * chooses not to inline would fail to link.
+ */
+static inline void prefetch_load_short(short *src) {
+  __asm__ __volatile__("pref 0, 0(%[src]) \n\t" : : [src] "r"(src));
+}
+
+/* 4x4 inverse transform of input, added to the predictor and stored to dst.
+ *
+ * input       16 coefficients, row-major 4x4
+ * pred_ptr    predictor pixels, pred_stride bytes between rows
+ * dst_ptr     destination pixels, dst_stride bytes between rows
+ *
+ * The first pass works down the columns of input into a temporary block, the
+ * second pass works along the rows of that block in place with rounding
+ * ((x + 4) >> 3), and the result plus predictor is mapped through the crop
+ * table.  Both passes compute exactly what the former hand-unrolled code did.
+ */
+void vp8_short_idct4x4llm_dspr2(short *input, unsigned char *pred_ptr,
+                                int pred_stride, unsigned char *dst_ptr,
+                                int dst_stride) {
+  int r, c;
+  int a1, b1, c1, d1;
+  short output[16];
+  short *ip = input;
+  short *op = output;
+  int temp1, temp2;
+  int shortpitch = 4;
+  unsigned char *cm = ff_cropTbl + CROP_WIDTH;
+
+  /* prepare data for load */
+  prefetch_load_short(ip + 8);
+
+  /* first pass: one column per iteration (rows are 4 shorts apart) */
+  for (c = 0; c < 4; ++c) {
+    a1 = ip[c] + ip[8 + c];
+    b1 = ip[c] - ip[8 + c];
+
+    temp1 = (ip[4 + c] * sinpi8sqrt2) >> 16;
+    temp2 = ip[12 + c] + ((ip[12 + c] * cospi8sqrt2minus1) >> 16);
+    c1 = temp1 - temp2;
+
+    temp1 = ip[4 + c] + ((ip[4 + c] * cospi8sqrt2minus1) >> 16);
+    temp2 = (ip[12 + c] * sinpi8sqrt2) >> 16;
+    d1 = temp1 + temp2;
+
+    op[c] = a1 + d1;
+    op[12 + c] = a1 - d1;
+    op[4 + c] = b1 + c1;
+    op[8 + c] = b1 - c1;
+  }
+
+  ip = output;
+
+  /* prepare data for load */
+  prefetch_load_short(ip + shortpitch);
+
+  /* second pass: one row per iteration, in place; all four inputs of a row
+   * are read before any of its outputs is written */
+  for (r = 0; r < 4; ++r) {
+    a1 = ip[0] + ip[2];
+    b1 = ip[0] - ip[2];
+
+    temp1 = (ip[1] * sinpi8sqrt2) >> 16;
+    temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1) >> 16);
+    c1 = temp1 - temp2;
+
+    temp1 = ip[1] + ((ip[1] * cospi8sqrt2minus1) >> 16);
+    temp2 = (ip[3] * sinpi8sqrt2) >> 16;
+    d1 = temp1 + temp2;
+
+    op[0] = (a1 + d1 + 4) >> 3;
+    op[3] = (a1 - d1 + 4) >> 3;
+    op[1] = (b1 + c1 + 4) >> 3;
+    op[2] = (b1 - c1 + 4) >> 3;
+
+    ip += shortpitch;
+    op += shortpitch;
+  }
+
+  ip = output;
+
+  for (r = 0; r < 4; ++r) {
+    for (c = 0; c < 4; ++c) {
+      int a = ip[c] + pred_ptr[c];
+
+      /* cm points CROP_WIDTH entries into a table of 256 + 2 * CROP_WIDTH
+       * entries, so only indices -CROP_WIDTH .. 255 + CROP_WIDTH are valid.
+       * Large coefficients can push the sum outside that range; bound it
+       * so the lookup never leaves the table.
+       * NOTE(review): assumes the table's end entries hold the saturated
+       * values (0 and 255) - confirm against ff_cropTbl's initialiser. */
+      if (a < -CROP_WIDTH) {
+        a = -CROP_WIDTH;
+      } else if (a > 255 + CROP_WIDTH) {
+        a = 255 + CROP_WIDTH;
+      }
+
+      dst_ptr[c] = cm[a];
+    }
+
+    ip += 4;
+    dst_ptr += dst_stride;
+    pred_ptr += pred_stride;
+  }
+}
+
+/* Add the rounded DC value (input_dc + 4) >> 3 to a 4x4 block of predictor
+ * pixels with unsigned saturation and store the result to dst_ptr.
+ *
+ * Each row is handled as one 32-bit word of four bytes, so pred_ptr and
+ * dst_ptr (and their strides) must keep every row four-byte aligned.
+ *
+ * replv.qb replicates only the low 8 bits of its operand, so the magnitude
+ * of the DC value is limited to 255 first.  Adding or subtracting 255 with
+ * saturation already gives the fully saturated result, and without the
+ * limit a magnitude above 255 would wrap (e.g. 300 acted like 44).
+ */
+void vp8_dc_only_idct_add_dspr2(short input_dc, unsigned char *pred_ptr,
+                                int pred_stride, unsigned char *dst_ptr,
+                                int dst_stride) {
+  int a1;
+  int i, absa1;
+  int t2, vector_a1, vector_a;
+
+  /* a1 = ((input_dc + 4) >> 3); */
+  __asm__ __volatile__(
+      "addi %[a1], %[input_dc], 4 \n\t"
+      "sra %[a1], %[a1], 3 \n\t"
+      : [a1] "=r"(a1)
+      : [input_dc] "r"(input_dc));
+
+  if (a1 < 0) {
+    /* negative DC: subtract its magnitude from the predictor */
+    absa1 = -a1;
+    if (absa1 > 255) absa1 = 255;
+
+    /* use quad-byte
+     * input and output memory are four byte aligned
+     */
+    __asm__ __volatile__("replv.qb %[vector_a1], %[absa1] \n\t"
+                         : [vector_a1] "=r"(vector_a1)
+                         : [absa1] "r"(absa1));
+
+    /* use (predptr[c] - |a1|) instead of a1 + predptr[c] */
+    for (i = 4; i--;) {
+      __asm__ __volatile__(
+          "lw %[t2], 0(%[pred_ptr]) \n\t"
+          "add %[pred_ptr], %[pred_ptr], %[pred_stride] \n\t"
+          "subu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t"
+          "sw %[vector_a], 0(%[dst_ptr]) \n\t"
+          "add %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
+          : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a),
+            [dst_ptr] "+&r"(dst_ptr), [pred_ptr] "+&r"(pred_ptr)
+          : [dst_stride] "r"(dst_stride), [pred_stride] "r"(pred_stride),
+            [vector_a1] "r"(vector_a1)
+          /* the asm loads and stores memory the operands do not describe */
+          : "memory");
+    }
+  } else {
+    if (a1 > 255) a1 = 255;
+
+    /* use quad-byte
+     * input and output memory are four byte aligned
+     */
+    __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t"
+                         : [vector_a1] "=r"(vector_a1)
+                         : [a1] "r"(a1));
+
+    for (i = 4; i--;) {
+      __asm__ __volatile__(
+          "lw %[t2], 0(%[pred_ptr]) \n\t"
+          "add %[pred_ptr], %[pred_ptr], %[pred_stride] \n\t"
+          "addu_s.qb %[vector_a], %[vector_a1], %[t2] \n\t"
+          "sw %[vector_a], 0(%[dst_ptr]) \n\t"
+          "add %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
+          : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a),
+            [dst_ptr] "+&r"(dst_ptr), [pred_ptr] "+&r"(pred_ptr)
+          : [dst_stride] "r"(dst_stride), [pred_stride] "r"(pred_stride),
+            [vector_a1] "r"(vector_a1)
+          /* the asm loads and stores memory the operands do not describe */
+          : "memory");
+    }
+  }
+}
+
+/* Inverse 4x4 Walsh-Hadamard transform.
+ *
+ * input      16 coefficients, row-major 4x4
+ * mb_dqcoeff receives the 16 results, one every 16 shorts (result i is
+ *            stored at mb_dqcoeff[i * 16])
+ *
+ * A column pass fills a temporary block, a row pass then works on it in
+ * place with a rounding offset of 3 before the final shift right by 3.
+ */
+void vp8_short_inv_walsh4x4_dspr2(short *input, short *mb_dqcoeff) {
+  short output[16];
+  int k;
+
+  prefetch_load_short(input);
+
+  /* column pass: entries k, k + 4, k + 8, k + 12 form one column */
+  for (k = 0; k < 4; ++k) {
+    const int outer_sum = input[k] + input[12 + k];
+    const int inner_sum = input[4 + k] + input[8 + k];
+    const int inner_diff = input[4 + k] - input[8 + k];
+    const int outer_diff = input[k] - input[12 + k];
+
+    output[k] = outer_sum + inner_sum;
+    output[4 + k] = inner_diff + outer_diff;
+    output[8 + k] = outer_sum - inner_sum;
+    output[12 + k] = outer_diff - inner_diff;
+  }
+
+  prefetch_load_short(output);
+
+  /* row pass, in place: the four inputs of a row are read before any of its
+   * outputs is written; the + 3 is the rounding offset */
+  for (k = 0; k < 16; k += 4) {
+    short *row = output + k;
+    const int outer_sum = row[0] + row[3] + 3;
+    const int inner_sum = row[1] + row[2];
+    const int inner_diff = row[1] - row[2];
+    const int outer_diff = row[0] - row[3] + 3;
+
+    row[0] = (outer_sum + inner_sum) >> 3;
+    row[1] = (outer_diff + inner_diff) >> 3;
+    row[2] = (outer_sum - inner_sum) >> 3;
+    row[3] = (outer_diff - inner_diff) >> 3;
+  }
+
+  /* scatter the results, 16 shorts apart */
+  for (k = 0; k < 16; ++k) {
+    mb_dqcoeff[k * 16] = output[k];
+  }
+}
+
+/* DC-only variant of the inverse Walsh transform: the single value
+ * (input[0] + 3) >> 3 is written to 16 positions of mb_dqcoeff, 16 shorts
+ * (32 bytes) apart.
+ *
+ * This used to be 16 "sh" stores in inline asm with no "memory" clobber, so
+ * the compiler was never told that mb_dqcoeff had been written.  The plain
+ * loop stores the same values to the same addresses.
+ */
+void vp8_short_inv_walsh4x4_1_dspr2(short *input, short *mb_dqcoeff) {
+  int i;
+  int a1;
+
+  a1 = ((input[0] + 3) >> 3);
+
+  for (i = 0; i < 16; ++i) {
+    mb_dqcoeff[i * 16] = (short)a1;
+  }
+}
+
+#endif
diff --git a/media/libvpx/libvpx/vp8/common/mips/dspr2/reconinter_dspr2.c b/media/libvpx/libvpx/vp8/common/mips/dspr2/reconinter_dspr2.c
new file mode 100644
index 0000000000..e44ae29278
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/mips/dspr2/reconinter_dspr2.c
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+#if HAVE_DSPR2
+/* Issue a "pref 0" (prefetch for load) hint for the address src.
+ *
+ * static: a plain file-scope 'inline' definition does not provide an
+ * external definition in C99/C11, so a call the compiler chooses not to
+ * inline would be left unresolved at link time.
+ */
+static inline void prefetch_load_int(unsigned char *src) {
+  __asm__ __volatile__("pref   0,  0(%[src])   \n\t" : : [src] "r"(src));
+}
+
+/* Copy a 16x16 block of bytes from src to dst.
+ *
+ * src, src_stride - source block and the byte distance between its rows;
+ *     rows are read with ulw, so src may be unaligned.
+ * dst, dst_stride - destination block and the byte distance between its
+ *     rows; rows are written with sw (NOTE(review): this presumably requires
+ *     every dst row to be 4-byte aligned - confirm against callers).
+ */
+__inline void vp8_copy_mem16x16_dspr2(unsigned char *RESTRICT src,
+                                      int src_stride,
+                                      unsigned char *RESTRICT dst,
+                                      int dst_stride) {
+  int r;
+  unsigned int a0, a1, a2, a3;
+
+  for (r = 16; r--;) {
+    /* hint the next source row into the cache */
+    prefetch_load_int(src + src_stride);
+
+    /* unaligned word loads, word stores; the "memory" clobber tells the
+     * compiler that the asm reads and writes memory reached through the
+     * pointer operands
+     */
+    __asm__ __volatile__(
+        "ulw    %[a0], 0(%[src])            \n\t"
+        "ulw    %[a1], 4(%[src])            \n\t"
+        "ulw    %[a2], 8(%[src])            \n\t"
+        "ulw    %[a3], 12(%[src])           \n\t"
+        "sw     %[a0], 0(%[dst])            \n\t"
+        "sw     %[a1], 4(%[dst])            \n\t"
+        "sw     %[a2], 8(%[dst])            \n\t"
+        "sw     %[a3], 12(%[dst])           \n\t"
+        : [a0] "=&r"(a0), [a1] "=&r"(a1), [a2] "=&r"(a2), [a3] "=&r"(a3)
+        : [src] "r"(src), [dst] "r"(dst)
+        : "memory");
+
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+/* Copy an 8x8 block of bytes from src to dst.
+ *
+ * src, src_stride - source block and the byte distance between its rows;
+ *     rows are read with ulw, so src may be unaligned.
+ * dst, dst_stride - destination block and the byte distance between its
+ *     rows; rows are written with sw (NOTE(review): this presumably requires
+ *     every dst row to be 4-byte aligned - confirm against callers).
+ */
+__inline void vp8_copy_mem8x8_dspr2(unsigned char *RESTRICT src, int src_stride,
+                                    unsigned char *RESTRICT dst,
+                                    int dst_stride) {
+  int r;
+  unsigned int a0, a1;
+
+  /* hint the second source row into the cache (issued once, before the loop) */
+  prefetch_load_int(src + src_stride);
+
+  for (r = 8; r--;) {
+    /* unaligned word loads, word stores; the "memory" clobber tells the
+     * compiler that the asm reads and writes memory reached through the
+     * pointer operands
+     */
+    __asm__ __volatile__(
+        "ulw    %[a0], 0(%[src])            \n\t"
+        "ulw    %[a1], 4(%[src])            \n\t"
+        "sw     %[a0], 0(%[dst])            \n\t"
+        "sw     %[a1], 4(%[dst])            \n\t"
+        : [a0] "=&r"(a0), [a1] "=&r"(a1)
+        : [src] "r"(src), [dst] "r"(dst)
+        : "memory");
+
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+/* Copy an 8-wide, 4-high block of bytes from src to dst.
+ *
+ * src, src_stride - source block and the byte distance between its rows;
+ *     rows are read with ulw, so src may be unaligned.
+ * dst, dst_stride - destination block and the byte distance between its
+ *     rows; rows are written with sw (NOTE(review): this presumably requires
+ *     every dst row to be 4-byte aligned - confirm against callers).
+ */
+__inline void vp8_copy_mem8x4_dspr2(unsigned char *RESTRICT src, int src_stride,
+                                    unsigned char *RESTRICT dst,
+                                    int dst_stride) {
+  int r;
+  unsigned int a0, a1;
+
+  /* hint the second source row into the cache (issued once, before the loop) */
+  prefetch_load_int(src + src_stride);
+
+  for (r = 4; r--;) {
+    /* unaligned word loads, word stores; the "memory" clobber tells the
+     * compiler that the asm reads and writes memory reached through the
+     * pointer operands
+     */
+    __asm__ __volatile__(
+        "ulw    %[a0], 0(%[src])            \n\t"
+        "ulw    %[a1], 4(%[src])            \n\t"
+        "sw     %[a0], 0(%[dst])            \n\t"
+        "sw     %[a1], 4(%[dst])            \n\t"
+        : [a0] "=&r"(a0), [a1] "=&r"(a1)
+        : [src] "r"(src), [dst] "r"(dst)
+        : "memory");
+
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+#endif
diff --git a/media/libvpx/libvpx/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c b/media/libvpx/libvpx/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c
new file mode 100644
index 0000000000..21446fb413
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c
@@ -0,0 +1,2401 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include "vp8_rtcd.h"
+#include "vp8/common/onyxc_int.h"
+
+#if HAVE_DSPR2
+typedef unsigned char uc; /* short alias for unsigned char */
+
+/* Prefetch data for load: issues a "pref 0" hint for the address src.
+ *
+ * static: a plain file-scope 'inline' definition does not provide an
+ * external definition in C99/C11, so a call the compiler chooses not to
+ * inline would be left unresolved at link time.
+ */
+static inline void prefetch_load_lf(unsigned char *src) {
+  __asm__ __volatile__("pref   0,  0(%[src])   \n\t" : : [src] "r"(src));
+}
+
+/* Prefetch data for store: issues a "pref 1" hint for the address dst.
+ *
+ * static: a plain file-scope 'inline' definition does not provide an
+ * external definition in C99/C11, so a call the compiler chooses not to
+ * inline would be left unresolved at link time.
+ */
+static inline void prefetch_store_lf(unsigned char *dst) {
+  __asm__ __volatile__("pref   1,  0(%[dst])   \n\t" : : [dst] "r"(dst));
+}
+
+/* Compute the loop-filter mask and the high-edge-variance (hev) flags for
+ * four pixels at the same time; both results come out of this one function.
+ * Every uint32_t argument is treated as four packed 8-bit values (.qb ops).
+ *
+ * limit, flimit, thresh - thresholds, compared byte-wise against packed
+ *     absolute pixel differences (NOTE(review): callers presumably replicate
+ *     each threshold into all four bytes - confirm).
+ * p3..p0, q0..q3 - packed pixels on either side of the edge. Beware of the
+ *     parameter order: p1, p0, p3, p2, q0, q1, q2, q3.
+ * hev  - out: result of a pick.qb between 'ones' and zero, keyed on the
+ *     abs(p1 - p0) > thresh and abs(q1 - q0) > thresh comparisons.
+ * mask - out: result of a pick.qb between zero and 'ones', keyed on the OR
+ *     of every limit/flimit comparison, i.e. inverted with respect to the
+ *     "exceeds the limit" bits accumulated in r.
+ *
+ * NOTE(review): both wrdsp instructions overwrite DSPControl and the asm
+ * statements declare no clobber for it - confirm nothing else depends on it.
+ */
+static __inline void vp8_filter_mask_vec_mips(
+ uint32_t limit, uint32_t flimit, uint32_t p1, uint32_t p0, uint32_t p3,
+ uint32_t p2, uint32_t q0, uint32_t q1, uint32_t q2, uint32_t q3,
+ uint32_t thresh, uint32_t *hev, uint32_t *mask) {
+ uint32_t c, r, r3, r_k;
+ uint32_t s1, s2, s3;
+ uint32_t ones = 0xFFFFFFFF;
+ uint32_t hev1;
+
+ __asm__ __volatile__(
+ /* mask |= (abs(p3 - p2) > limit)
+ * abs() is built from two saturating subtractions ORed together
+ */
+ "subu_s.qb %[c], %[p3], %[p2] \n\t"
+ "subu_s.qb %[r_k], %[p2], %[p3] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], $0, %[c] \n\t"
+
+ /* mask |= (abs(p2 - p1) > limit) */
+ "subu_s.qb %[c], %[p2], %[p1] \n\t"
+ "subu_s.qb %[r_k], %[p1], %[p2] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+
+ /* mask |= (abs(p1 - p0) > limit)
+ * hev |= (abs(p1 - p0) > thresh)
+ */
+ "subu_s.qb %[c], %[p1], %[p0] \n\t"
+ "subu_s.qb %[r_k], %[p0], %[p1] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t"
+ "or %[r3], $0, %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+
+ /* mask |= (abs(q1 - q0) > limit)
+ * hev |= (abs(q1 - q0) > thresh)
+ */
+ "subu_s.qb %[c], %[q1], %[q0] \n\t"
+ "subu_s.qb %[r_k], %[q0], %[q1] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t"
+ "or %[r3], %[r3], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+
+ /* mask |= (abs(q2 - q1) > limit) */
+ "subu_s.qb %[c], %[q2], %[q1] \n\t"
+ "subu_s.qb %[r_k], %[q1], %[q2] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+ "sll %[r3], %[r3], 24 \n\t" /* hev bits moved up for the later wrdsp */
+
+ /* mask |= (abs(q3 - q2) > limit) */
+ "subu_s.qb %[c], %[q3], %[q2] \n\t"
+ "subu_s.qb %[r_k], %[q2], %[q3] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+
+ : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r3] "=&r"(r3)
+ : [limit] "r"(limit), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
+ [p0] "r"(p0), [q1] "r"(q1), [q0] "r"(q0), [q2] "r"(q2), [q3] "r"(q3),
+ [thresh] "r"(thresh));
+
+ __asm__ __volatile__(
+ /* abs(p0 - q0) */
+ "subu_s.qb %[c], %[p0], %[q0] \n\t"
+ "subu_s.qb %[r_k], %[q0], %[p0] \n\t"
+ "wrdsp %[r3] \n\t" /* hev compare bits -> DSPControl */
+ "or %[s1], %[r_k], %[c] \n\t"
+
+ /* abs(p1 - q1) */
+ "subu_s.qb %[c], %[p1], %[q1] \n\t"
+ "addu_s.qb %[s3], %[s1], %[s1] \n\t"
+ "pick.qb %[hev1], %[ones], $0 \n\t" /* expand hev bits to bytes */
+ "subu_s.qb %[r_k], %[q1], %[p1] \n\t"
+ "or %[s2], %[r_k], %[c] \n\t"
+
+ /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit
+ * (this comment used to read "> flimit * 2 + limit"; the instruction
+ * compares against the flimit operand only, so presumably the caller
+ * passes the already-combined value - TODO confirm)
+ */
+ "shrl.qb %[s2], %[s2], 1 \n\t"
+ "addu_s.qb %[s1], %[s2], %[s3] \n\t"
+ "cmpgu.lt.qb %[c], %[flimit], %[s1] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+ "sll %[r], %[r], 24 \n\t"
+
+ "wrdsp %[r] \n\t" /* accumulated mask compare bits -> DSPControl */
+ "pick.qb %[s2], $0, %[ones] \n\t" /* operands swapped: inverted select */
+
+ : [c] "=&r"(c), [r_k] "=&r"(r_k), [s1] "=&r"(s1), [hev1] "=&r"(hev1),
+ [s2] "=&r"(s2), [r] "+r"(r), [s3] "=&r"(s3)
+ : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [r3] "r"(r3), [q1] "r"(q1),
+ [ones] "r"(ones), [flimit] "r"(flimit));
+
+ *hev = hev1;
+ *mask = s2;
+}
+
+/* Apply the filter arithmetic to four pixels at the same time.
+ * Inputs & outputs are quad-byte vectors.
+ *
+ * mask, hev - packed per-pixel masks; each is ANDed into the filter value
+ *     (hev is also used inverted for the outer taps).
+ * ps1, ps0, qs0, qs1 - in/out: the two pixels on each side of the edge; they
+ *     are read, filtered and written back through the pointers.
+ *
+ * Each vector is XORed with 0x80808080 on entry and again on exit, and is
+ * split into two halfword pairs: the _l variables keep bytes 3 and 1 in
+ * place, the _r variables hold bytes 2 and 0 shifted up by 8, so every byte
+ * sits in the upper half of a 16-bit lane for the saturating .ph operations.
+ */
+static __inline void vp8_filter_mips(uint32_t mask, uint32_t hev, uint32_t *ps1,
+ uint32_t *ps0, uint32_t *qs0,
+ uint32_t *qs1) {
+ int32_t vp8_filter_l, vp8_filter_r;
+ int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r;
+ int32_t subr_r, subr_l;
+ uint32_t t1, t2, HWM, t3;
+ uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r;
+
+ int32_t vps1, vps0, vqs0, vqs1;
+ int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r;
+ uint32_t N128;
+
+ N128 = 0x80808080;
+ t1 = 0x03000300; /* 3 in the upper byte of each halfword */
+ t2 = 0x04000400; /* 4 in the upper byte of each halfword */
+ t3 = 0x01000100; /* 1 in the upper byte of each halfword */
+ HWM = 0xFF00FF00; /* keeps the upper byte of each halfword */
+
+ vps0 = (*ps0) ^ N128;
+ vps1 = (*ps1) ^ N128;
+ vqs0 = (*qs0) ^ N128;
+ vqs1 = (*qs1) ^ N128;
+
+ /* use halfword pairs instead of quad-bytes for accuracy */
+ vps0_l = vps0 & HWM;
+ vps0_r = vps0 << 8;
+ vps0_r = vps0_r & HWM;
+
+ vps1_l = vps1 & HWM;
+ vps1_r = vps1 << 8;
+ vps1_r = vps1_r & HWM;
+
+ vqs0_l = vqs0 & HWM;
+ vqs0_r = vqs0 << 8;
+ vqs0_r = vqs0_r & HWM;
+
+ vqs1_l = vqs1 & HWM;
+ vqs1_r = vqs1 << 8;
+ vqs1_r = vqs1_r & HWM;
+
+ mask_l = mask & HWM;
+ mask_r = mask << 8;
+ mask_r = mask_r & HWM;
+
+ hev_l = hev & HWM;
+ hev_r = hev << 8;
+ hev_r = hev_r & HWM;
+
+ __asm__ __volatile__(
+ /* vp8_filter = vp8_signed_char_clamp(ps1 - qs1); */
+ "subq_s.ph %[vp8_filter_l], %[vps1_l], %[vqs1_l] \n\t"
+ "subq_s.ph %[vp8_filter_r], %[vps1_r], %[vqs1_r] \n\t"
+
+ /* qs0 - ps0 */
+ "subq_s.ph %[subr_l], %[vqs0_l], %[vps0_l] \n\t"
+ "subq_s.ph %[subr_r], %[vqs0_r], %[vps0_r] \n\t"
+
+ /* vp8_filter &= hev; */
+ "and %[vp8_filter_l], %[vp8_filter_l], %[hev_l] \n\t"
+ "and %[vp8_filter_r], %[vp8_filter_r], %[hev_r] \n\t"
+
+ /* vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0));
+ * done as three saturating adds of (qs0 - ps0); the two xor
+ * instructions interleaved here compute ~hev within the HWM lanes
+ */
+ "addq_s.ph %[vp8_filter_l], %[vp8_filter_l], %[subr_l] \n\t"
+ "addq_s.ph %[vp8_filter_r], %[vp8_filter_r], %[subr_r] \n\t"
+ "xor %[invhev_l], %[hev_l], %[HWM] \n\t"
+ "addq_s.ph %[vp8_filter_l], %[vp8_filter_l], %[subr_l] \n\t"
+ "addq_s.ph %[vp8_filter_r], %[vp8_filter_r], %[subr_r] \n\t"
+ "xor %[invhev_r], %[hev_r], %[HWM] \n\t"
+ "addq_s.ph %[vp8_filter_l], %[vp8_filter_l], %[subr_l] \n\t"
+ "addq_s.ph %[vp8_filter_r], %[vp8_filter_r], %[subr_r] \n\t"
+
+ /* vp8_filter &= mask; */
+ "and %[vp8_filter_l], %[vp8_filter_l], %[mask_l] \n\t"
+ "and %[vp8_filter_r], %[vp8_filter_r], %[mask_r] \n\t"
+
+ : [vp8_filter_l] "=&r"(vp8_filter_l), [vp8_filter_r] "=&r"(vp8_filter_r),
+ [subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r),
+ [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r)
+
+ : [vps0_l] "r"(vps0_l), [vps0_r] "r"(vps0_r), [vps1_l] "r"(vps1_l),
+ [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r),
+ [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r), [mask_l] "r"(mask_l),
+ [mask_r] "r"(mask_r), [hev_l] "r"(hev_l), [hev_r] "r"(hev_r),
+ [HWM] "r"(HWM));
+
+ /* save bottom 3 bits so that we round one side +4 and the other +3 */
+ __asm__ __volatile__(
+ /* Filter1 = vp8_signed_char_clamp(vp8_filter + 4) >>= 3; (t2 holds 4) */
+ "addq_s.ph %[Filter1_l], %[vp8_filter_l], %[t2] \n\t"
+ "addq_s.ph %[Filter1_r], %[vp8_filter_r], %[t2] \n\t"
+
+ /* Filter2 = vp8_signed_char_clamp(vp8_filter + 3) >>= 3; (t1 holds 3) */
+ "addq_s.ph %[Filter2_l], %[vp8_filter_l], %[t1] \n\t"
+ "addq_s.ph %[Filter2_r], %[vp8_filter_r], %[t1] \n\t"
+ "shra.ph %[Filter1_r], %[Filter1_r], 3 \n\t"
+ "shra.ph %[Filter1_l], %[Filter1_l], 3 \n\t"
+
+ "shra.ph %[Filter2_l], %[Filter2_l], 3 \n\t"
+ "shra.ph %[Filter2_r], %[Filter2_r], 3 \n\t"
+
+ "and %[Filter1_l], %[Filter1_l], %[HWM] \n\t"
+ "and %[Filter1_r], %[Filter1_r], %[HWM] \n\t"
+
+ /* vps0 = vp8_signed_char_clamp(ps0 + Filter2); */
+ "addq_s.ph %[vps0_l], %[vps0_l], %[Filter2_l] \n\t"
+ "addq_s.ph %[vps0_r], %[vps0_r], %[Filter2_r] \n\t"
+
+ /* vqs0 = vp8_signed_char_clamp(qs0 - Filter1); */
+ "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t"
+ "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t"
+
+ : [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r),
+ [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r),
+ [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l),
+ [vqs0_r] "+r"(vqs0_r)
+
+ : [t1] "r"(t1), [t2] "r"(t2), [vp8_filter_l] "r"(vp8_filter_l),
+ [vp8_filter_r] "r"(vp8_filter_r), [HWM] "r"(HWM));
+
+ __asm__ __volatile__(
+ /* vp8_filter = (Filter1 + 1) >> 1; addqh.ph adds t3 and halves */
+ "addqh.ph %[Filter1_l], %[Filter1_l], %[t3] \n\t"
+ "addqh.ph %[Filter1_r], %[Filter1_r], %[t3] \n\t"
+
+ /* vp8_filter &= ~hev; */
+ "and %[Filter1_l], %[Filter1_l], %[invhev_l] \n\t"
+ "and %[Filter1_r], %[Filter1_r], %[invhev_r] \n\t"
+
+ /* vps1 = vp8_signed_char_clamp(ps1 + vp8_filter); */
+ "addq_s.ph %[vps1_l], %[vps1_l], %[Filter1_l] \n\t"
+ "addq_s.ph %[vps1_r], %[vps1_r], %[Filter1_r] \n\t"
+
+ /* vqs1 = vp8_signed_char_clamp(qs1 - vp8_filter); */
+ "subq_s.ph %[vqs1_l], %[vqs1_l], %[Filter1_l] \n\t"
+ "subq_s.ph %[vqs1_r], %[vqs1_r], %[Filter1_r] \n\t"
+
+ : [Filter1_l] "+r"(Filter1_l), [Filter1_r] "+r"(Filter1_r),
+ [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l),
+ [vqs1_r] "+r"(vqs1_r)
+
+ : [t3] "r"(t3), [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r));
+
+ /* Create quad-bytes from halfword pairs */
+ vqs0_l = vqs0_l & HWM;
+ vqs1_l = vqs1_l & HWM;
+ vps0_l = vps0_l & HWM;
+ vps1_l = vps1_l & HWM;
+
+ __asm__ __volatile__(
+ "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t"
+ "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t"
+ "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t"
+ "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t"
+
+ : [vps1_r] "+r"(vps1_r), [vqs1_r] "+r"(vqs1_r), [vps0_r] "+r"(vps0_r),
+ [vqs0_r] "+r"(vqs0_r)
+ :);
+
+ vqs0 = vqs0_l | vqs0_r;
+ vqs1 = vqs1_l | vqs1_r;
+ vps0 = vps0_l | vps0_r;
+ vps1 = vps1_l | vps1_r;
+
+ *ps0 = vps0 ^ N128;
+ *ps1 = vps1 ^ N128;
+ *qs0 = vqs0 ^ N128;
+ *qs1 = vqs1 ^ N128;
+}
+
+/* Filter one 4-pixel-wide segment of a horizontal edge.
+ *
+ * s points at the first pixel below the edge and p is the row stride. The
+ * eight rows s - 4 * p .. s + 3 * p are read (the outer four only when
+ * needed) and the four rows s - 2 * p .. s + p are rewritten when the
+ * computed mask is non-zero. Rows are accessed as uint32_t words, so each
+ * row address is assumed to be 4-byte aligned.
+ */
+static __inline void vp8_lf_horizontal_quad_mips(unsigned char *s, int p,
+                                                 unsigned int flimit,
+                                                 unsigned int limit,
+                                                 unsigned int thresh) {
+  unsigned char *const sm1 = s - 4 * p;
+  unsigned char *const s0 = s - 3 * p;
+  unsigned char *const s1 = s - 2 * p;
+  unsigned char *const s2 = s - p;
+  unsigned char *const s3 = s;
+  unsigned char *const s4 = s + p;
+  unsigned char *const s5 = s + 2 * p;
+  unsigned char *const s6 = s + 3 * p;
+  uint32_t mask = 0;
+  uint32_t hev = 0;
+  uint32_t pm1, p0, p5, p6;
+
+  /* load quad-byte vectors
+   * memory is 4 byte aligned
+   */
+  uint32_t p1 = *((uint32_t *)(s1));
+  uint32_t p2 = *((uint32_t *)(s2));
+  uint32_t p3 = *((uint32_t *)(s3));
+  uint32_t p4 = *((uint32_t *)(s4));
+
+  /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+   * mask will be zero and filtering is not needed
+   */
+  if ((p1 == p4) && (p2 == p3)) return;
+
+  pm1 = *((uint32_t *)(sm1));
+  p0 = *((uint32_t *)(s0));
+  p5 = *((uint32_t *)(s5));
+  p6 = *((uint32_t *)(s6));
+
+  vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+                           thresh, &hev, &mask);
+
+  /* if mask == 0 filtering is not needed */
+  if (!mask) return;
+
+  /* filtering */
+  vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
+
+  /* write back the four filtered rows */
+  *((uint32_t *)s1) = p1;
+  *((uint32_t *)s2) = p2;
+  *((uint32_t *)s3) = p3;
+  *((uint32_t *)s4) = p4;
+}
+
+/* Loop-filter a 16-pixel-wide horizontal edge.
+ *
+ * s      - first pixel below the edge
+ * p      - row stride in bytes
+ * flimit, limit, thresh - thresholds forwarded to vp8_filter_mask_vec_mips
+ * count  - unused
+ *
+ * The loop filter is designed to work on chars so that maximum use can be
+ * made of the 8 bit simd instructions; the edge is handled as four segments
+ * of four pixels.
+ */
+void vp8_loop_filter_horizontal_edge_mips(unsigned char *s, int p,
+                                          unsigned int flimit,
+                                          unsigned int limit,
+                                          unsigned int thresh, int count) {
+  int x;
+  (void)count;
+
+  /* prefetch data for store */
+  prefetch_store_lf(s);
+
+  for (x = 0; x < 16; x += 4) {
+    vp8_lf_horizontal_quad_mips(s + x, p, flimit, limit, thresh);
+  }
+}
+
+/* Loop-filter an 8-pixel-wide horizontal edge (two segments of four pixels).
+ *
+ * Parameters are as for vp8_loop_filter_horizontal_edge_mips; count is
+ * unused. Unlike that function, no store prefetch is issued.
+ */
+void vp8_loop_filter_uvhorizontal_edge_mips(unsigned char *s, int p,
+                                            unsigned int flimit,
+                                            unsigned int limit,
+                                            unsigned int thresh, int count) {
+  int x;
+  (void)count;
+
+  for (x = 0; x < 8; x += 4) {
+    vp8_lf_horizontal_quad_mips(s + x, p, flimit, limit, thresh);
+  }
+}
+
+/* Filter four consecutive rows across a vertical edge.
+ *
+ * s points at the first pixel to the right of the edge in the first row and
+ * p is the row stride. For each of the four rows the words at s - 4 and s
+ * are loaded (assumed 4-byte aligned), the two 4x4 byte groups are
+ * transposed so each register holds one pixel column, the columns are
+ * filtered, and the four pixels at offsets -2 .. +1 of every row are written
+ * back byte by byte when the computed mask is non-zero.
+ */
+static __inline void vp8_lf_vertical_quad_mips(unsigned char *s, int p,
+                                               unsigned int flimit,
+                                               unsigned int limit,
+                                               unsigned int thresh) {
+  unsigned char *const s1 = s;
+  unsigned char *const s2 = s1 + p;
+  unsigned char *const s3 = s2 + p;
+  unsigned char *const s4 = s3 + p;
+  uint32_t mask = 0;
+  uint32_t hev = 0;
+  uint32_t prim1, prim2, prim3, prim4, sec3, sec4;
+  int k;
+
+  /* load quad-byte vectors
+   * memory is 4 byte aligned
+   */
+  uint32_t p2 = *((uint32_t *)(s1 - 4));
+  uint32_t p6 = *((uint32_t *)(s1));
+  uint32_t p1 = *((uint32_t *)(s2 - 4));
+  uint32_t p5 = *((uint32_t *)(s2));
+  uint32_t p0 = *((uint32_t *)(s3 - 4));
+  uint32_t p4 = *((uint32_t *)(s3));
+  uint32_t pm1 = *((uint32_t *)(s4 - 4));
+  uint32_t p3 = *((uint32_t *)(s4));
+
+  /* transpose pm1, p0, p1, p2 */
+  __asm__ __volatile__(
+      "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
+      "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
+      "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
+      "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"
+
+      "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
+      "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
+      "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
+      "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
+
+      "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
+      "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
+      "append         %[p1],      %[sec3],    16          \n\t"
+      "append         %[pm1],     %[sec4],    16          \n\t"
+
+      : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
+        [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0),
+        [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
+      :);
+
+  /* transpose p3, p4, p5, p6 */
+  __asm__ __volatile__(
+      "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
+      "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
+      "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
+      "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"
+
+      "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
+      "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
+      "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
+      "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
+
+      "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
+      "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
+      "append         %[p5],      %[sec3],    16          \n\t"
+      "append         %[p3],      %[sec4],    16          \n\t"
+
+      : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
+        [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4),
+        [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
+      :);
+
+  /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+   * mask will be zero and filtering is not needed
+   */
+  if ((p1 == p4) && (p2 == p3)) return;
+
+  vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+                           thresh, &hev, &mask);
+
+  /* if mask == 0 filtering is not needed */
+  if (!mask) return;
+
+  /* filtering */
+  vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
+
+  /* unpack processed 4x4 neighborhood
+   * don't use transpose on output data
+   * because memory isn't aligned
+   *
+   * The low byte of each vector belongs to the last row (s4), the next byte
+   * to s3, and so on up to s1. Plain C byte stores are used instead of "sb"
+   * inline assembly, which wrote memory without telling the compiler.
+   */
+  for (k = 3; k >= 0; --k) {
+    unsigned char *const row = s + k * p;
+
+    row[1] = (unsigned char)p4;
+    row[0] = (unsigned char)p3;
+    row[-1] = (unsigned char)p2;
+    row[-2] = (unsigned char)p1;
+
+    p4 >>= 8;
+    p3 >>= 8;
+    p2 >>= 8;
+    p1 >>= 8;
+  }
+}
+
+/* Loop-filter a vertical edge.
+ *
+ * s      - first pixel to the right of the edge in the first row
+ * p      - row stride in bytes
+ * flimit, limit, thresh - thresholds forwarded to vp8_filter_mask_vec_mips
+ * count  - number of rows; rows are processed eight at a time and at least
+ *          eight rows are always processed (do/while loop)
+ *
+ * The loop filter is designed to work on chars so that maximum use can be
+ * made of the 8 bit simd instructions; four pixels are filtered at a time.
+ */
+void vp8_loop_filter_vertical_edge_mips(unsigned char *s, int p,
+                                        const unsigned int flimit,
+                                        const unsigned int limit,
+                                        const unsigned int thresh, int count) {
+  int i = 0;
+
+  do {
+    /* prefetch data for store */
+    prefetch_store_lf(s + p);
+
+    vp8_lf_vertical_quad_mips(s, p, flimit, limit, thresh);
+    s += 4 * p;
+
+    vp8_lf_vertical_quad_mips(s, p, flimit, limit, thresh);
+    s += 4 * p;
+
+    i += 8;
+  } while (i < count);
+}
+
+void vp8_loop_filter_uvvertical_edge_mips(unsigned char *s, int p,
+ unsigned int flimit,
+ unsigned int limit,
+ unsigned int thresh, int count) {
+ uint32_t mask, hev;
+ uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
+ unsigned char *s1, *s2, *s3, *s4;
+ uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
+ (void)count;
+
+ /* loop filter designed to work using chars so that we can make maximum use
+ * of 8 bit simd instructions.
+ */
+
+ /* apply filter on 4 pixesl at the same time */
+
+ s1 = s;
+ s2 = s + p;
+ s3 = s2 + p;
+ s4 = s3 + p;
+
+ /* load quad-byte vectors
+ * memory is 4 byte aligned
+ */
+ p2 = *((uint32_t *)(s1 - 4));
+ p6 = *((uint32_t *)(s1));
+ p1 = *((uint32_t *)(s2 - 4));
+ p5 = *((uint32_t *)(s2));
+ p0 = *((uint32_t *)(s3 - 4));
+ p4 = *((uint32_t *)(s3));
+ pm1 = *((uint32_t *)(s4 - 4));
+ p3 = *((uint32_t *)(s4));
+
+ /* transpose pm1, p0, p1, p2 */
+ __asm__ __volatile__(
+ "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t"
+ "precr.qb.ph %[prim2], %[p2], %[p1] \n\t"
+ "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t"
+ "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t"
+
+ "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p2], %[p1], %[sec3] \n\t"
+ "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t"
+ "append %[p1], %[sec3], 16 \n\t"
+ "append %[pm1], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
+ [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0),
+ [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
+ :);
+
+ /* transpose p3, p4, p5, p6 */
+ __asm__ __volatile__(
+ "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t"
+ "precr.qb.ph %[prim2], %[p6], %[p5] \n\t"
+ "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t"
+ "precr.qb.ph %[prim4], %[p4], %[p3] \n\t"
+
+ "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p6], %[p5], %[sec3] \n\t"
+ "precrq.ph.w %[p4], %[p3], %[sec4] \n\t"
+ "append %[p5], %[sec3], 16 \n\t"
+ "append %[p3], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
+ [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4),
+ [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
+ :);
+
+ /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+ * mask will be zero and filtering is not needed
+ */
+ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
+ vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+ thresh, &hev, &mask);
+
+ /* if mask == 0 do filtering is not needed */
+ if (mask) {
+ /* filtering */
+ vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
+
+ /* unpack processed 4x4 neighborhood
+ * don't use transpose on output data
+ * because memory isn't aligned
+ */
+ __asm__ __volatile__(
+ "sb %[p4], 1(%[s4]) \n\t"
+ "sb %[p3], 0(%[s4]) \n\t"
+ "sb %[p2], -1(%[s4]) \n\t"
+ "sb %[p1], -2(%[s4]) \n\t"
+ :
+ :
+ [p4] "r"(p4), [p3] "r"(p3), [s4] "r"(s4), [p2] "r"(p2), [p1] "r"(p1));
+
+ __asm__ __volatile__(
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+ : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
+ :);
+
+ __asm__ __volatile__(
+ "sb %[p4], 1(%[s3]) \n\t"
+ "sb %[p3], 0(%[s3]) \n\t"
+ "sb %[p2], -1(%[s3]) \n\t"
+ "sb %[p1], -2(%[s3]) \n\t"
+ : [p1] "+r"(p1)
+ : [p4] "r"(p4), [p3] "r"(p3), [s3] "r"(s3), [p2] "r"(p2));
+
+ __asm__ __volatile__(
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+ : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
+ :);
+
+ __asm__ __volatile__(
+ "sb %[p4], 1(%[s2]) \n\t"
+ "sb %[p3], 0(%[s2]) \n\t"
+ "sb %[p2], -1(%[s2]) \n\t"
+ "sb %[p1], -2(%[s2]) \n\t"
+ :
+ :
+ [p4] "r"(p4), [p3] "r"(p3), [s2] "r"(s2), [p2] "r"(p2), [p1] "r"(p1));
+
+ __asm__ __volatile__(
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+ : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
+ :);
+
+ __asm__ __volatile__(
+ "sb %[p4], 1(%[s1]) \n\t"
+ "sb %[p3], 0(%[s1]) \n\t"
+ "sb %[p2], -1(%[s1]) \n\t"
+ "sb %[p1], -2(%[s1]) \n\t"
+ :
+ :
+ [p4] "r"(p4), [p3] "r"(p3), [s1] "r"(s1), [p2] "r"(p2), [p1] "r"(p1));
+ }
+ }
+
+ s1 = s4 + p;
+ s2 = s1 + p;
+ s3 = s2 + p;
+ s4 = s3 + p;
+
+ /* load quad-byte vectors
+ * memory is 4 byte aligned
+ */
+ p2 = *((uint32_t *)(s1 - 4));
+ p6 = *((uint32_t *)(s1));
+ p1 = *((uint32_t *)(s2 - 4));
+ p5 = *((uint32_t *)(s2));
+ p0 = *((uint32_t *)(s3 - 4));
+ p4 = *((uint32_t *)(s3));
+ pm1 = *((uint32_t *)(s4 - 4));
+ p3 = *((uint32_t *)(s4));
+
+ /* transpose pm1, p0, p1, p2 */
+ __asm__ __volatile__(
+ "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t"
+ "precr.qb.ph %[prim2], %[p2], %[p1] \n\t"
+ "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t"
+ "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t"
+
+ "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p2], %[p1], %[sec3] \n\t"
+ "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t"
+ "append %[p1], %[sec3], 16 \n\t"
+ "append %[pm1], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
+ [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0),
+ [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
+ :);
+
+ /* transpose p3, p4, p5, p6 */
+ __asm__ __volatile__(
+ "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t"
+ "precr.qb.ph %[prim2], %[p6], %[p5] \n\t"
+ "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t"
+ "precr.qb.ph %[prim4], %[p4], %[p3] \n\t"
+
+ "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p6], %[p5], %[sec3] \n\t"
+ "precrq.ph.w %[p4], %[p3], %[sec4] \n\t"
+ "append %[p5], %[sec3], 16 \n\t"
+ "append %[p3], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
+ [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4),
+ [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
+ :);
+
+ /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+ * mask will be zero and filtering is not needed
+ */
+ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
+ vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+ thresh, &hev, &mask);
+
+ /* if mask == 0 do filtering is not needed */
+ if (mask) {
+ /* filtering */
+ vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
+
+ /* unpack processed 4x4 neighborhood
+ * don't use transpose on output data
+ * because memory isn't aligned
+ */
+ __asm__ __volatile__(
+ "sb %[p4], 1(%[s4]) \n\t"
+ "sb %[p3], 0(%[s4]) \n\t"
+ "sb %[p2], -1(%[s4]) \n\t"
+ "sb %[p1], -2(%[s4]) \n\t"
+ :
+ :
+ [p4] "r"(p4), [p3] "r"(p3), [s4] "r"(s4), [p2] "r"(p2), [p1] "r"(p1));
+
+ __asm__ __volatile__(
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+ : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
+ :);
+
+ __asm__ __volatile__(
+ "sb %[p4], 1(%[s3]) \n\t"
+ "sb %[p3], 0(%[s3]) \n\t"
+ "sb %[p2], -1(%[s3]) \n\t"
+ "sb %[p1], -2(%[s3]) \n\t"
+ : [p1] "+r"(p1)
+ : [p4] "r"(p4), [p3] "r"(p3), [s3] "r"(s3), [p2] "r"(p2));
+
+ __asm__ __volatile__(
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+ : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
+ :);
+
+ __asm__ __volatile__(
+ "sb %[p4], 1(%[s2]) \n\t"
+ "sb %[p3], 0(%[s2]) \n\t"
+ "sb %[p2], -1(%[s2]) \n\t"
+ "sb %[p1], -2(%[s2]) \n\t"
+ :
+ :
+ [p4] "r"(p4), [p3] "r"(p3), [s2] "r"(s2), [p2] "r"(p2), [p1] "r"(p1));
+
+ __asm__ __volatile__(
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+ : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
+ :);
+
+ __asm__ __volatile__(
+ "sb %[p4], 1(%[s1]) \n\t"
+ "sb %[p3], 0(%[s1]) \n\t"
+ "sb %[p2], -1(%[s1]) \n\t"
+ "sb %[p1], -2(%[s1]) \n\t"
+ :
+ :
+ [p4] "r"(p4), [p3] "r"(p3), [s1] "r"(s1), [p2] "r"(p2), [p1] "r"(p1));
+ }
+ }
+}
+
+/* inputs & outputs are quad-byte vectors
+ *
+ * Macroblock-edge filter applied to four packed bytes at once.
+ *
+ * mask, hev: byte-wise masks. The filter value is ANDed with mask; the part
+ * that is also selected by hev gets the +4 / +3 rounded >> 3
+ * adjustment of qs0 / ps0, the part selected by the inverted hev
+ * feeds the wider 27 / 18 / 9 taps.
+ * ps2..qs2: pointers to the six 32-bit words to filter. All six are read on
+ * entry and all six are overwritten with the filtered result.
+ *
+ * Every word is XORed with N128 on entry and again on exit, and the
+ * arithmetic in between is done on halfword pairs (see the _l / _r split
+ * below) with the saturating DSP instructions.
+ */
+static __inline void vp8_mbfilter_mips(uint32_t mask, uint32_t hev,
+ uint32_t *ps2, uint32_t *ps1,
+ uint32_t *ps0, uint32_t *qs0,
+ uint32_t *qs1, uint32_t *qs2) {
+ int32_t vps2, vps1, vps0, vqs0, vqs1, vqs2;
+ int32_t vps2_l, vps1_l, vps0_l, vqs0_l, vqs1_l, vqs2_l;
+ int32_t vps2_r, vps1_r, vps0_r, vqs0_r, vqs1_r, vqs2_r;
+ uint32_t HWM, vp8_filter_l, vp8_filter_r, mask_l, mask_r, hev_l, hev_r,
+ subr_r, subr_l;
+ uint32_t Filter2_l, Filter2_r, t1, t2, Filter1_l, Filter1_r, invhev_l,
+ invhev_r;
+ uint32_t N128, R63;
+ uint32_t u1_l, u1_r, u2_l, u2_r, u3_l, u3_r;
+
+ R63 = 0x003F003F; /* 63 in each halfword: rounding term added before >> 7 */
+ HWM = 0xFF00FF00; /* selects the upper byte of each halfword */
+ N128 = 0x80808080; /* XOR with this flips the top bit of every byte */
+ t1 = 0x03000300; /* 3 in the upper byte of each halfword */
+ t2 = 0x04000400; /* 4 in the upper byte of each halfword */
+
+ vps0 = (*ps0) ^ N128;
+ vps1 = (*ps1) ^ N128;
+ vps2 = (*ps2) ^ N128;
+ vqs0 = (*qs0) ^ N128;
+ vqs1 = (*qs1) ^ N128;
+ vqs2 = (*qs2) ^ N128;
+
+ /* use halfword pairs instead of quad-bytes because of accuracy:
+ * the _l value keeps bytes 3 and 1 of the word, the _r value keeps bytes 2
+ * and 0 (shifted left by 8); in both cases the byte sits in the upper half
+ * of a 16-bit lane and the lower half is cleared by HWM.
+ */
+ vps0_l = vps0 & HWM;
+ vps0_r = vps0 << 8;
+ vps0_r = vps0_r & HWM;
+
+ vqs0_l = vqs0 & HWM;
+ vqs0_r = vqs0 << 8;
+ vqs0_r = vqs0_r & HWM;
+
+ vps1_l = vps1 & HWM;
+ vps1_r = vps1 << 8;
+ vps1_r = vps1_r & HWM;
+
+ vqs1_l = vqs1 & HWM;
+ vqs1_r = vqs1 << 8;
+ vqs1_r = vqs1_r & HWM;
+
+ vqs2_l = vqs2 & HWM;
+ vqs2_r = vqs2 << 8;
+ vqs2_r = vqs2_r & HWM;
+
+ __asm__ __volatile__(
+ /* subr = qs0 - ps0 (saturating, per halfword) */
+ "subq_s.ph %[subr_l], %[vqs0_l], %[vps0_l] \n\t"
+ "subq_s.ph %[subr_r], %[vqs0_r], %[vps0_r] \n\t"
+
+ /* vp8_filter = vp8_signed_char_clamp(ps1 - qs1); */
+ "subq_s.ph %[vp8_filter_l], %[vps1_l], %[vqs1_l] \n\t"
+ "subq_s.ph %[vp8_filter_r], %[vps1_r], %[vqs1_r] \n\t"
+
+ : [vp8_filter_l] "=&r"(vp8_filter_l), [vp8_filter_r] "=r"(vp8_filter_r),
+ [subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r)
+ : [vps0_l] "r"(vps0_l), [vps0_r] "r"(vps0_r), [vps1_l] "r"(vps1_l),
+ [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r),
+ [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r));
+
+ vps2_l = vps2 & HWM;
+ vps2_r = vps2 << 8;
+ vps2_r = vps2_r & HWM;
+
+ /* add outer taps if we have high edge variance */
+ __asm__ __volatile__(
+ /* vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0));
+ * done as three saturating additions of subr; the splitting of mask
+ * and hev into their _l / _r lanes is interleaved between them.
+ */
+ "addq_s.ph %[vp8_filter_l], %[vp8_filter_l], %[subr_l] \n\t"
+ "addq_s.ph %[vp8_filter_r], %[vp8_filter_r], %[subr_r] \n\t"
+ "and %[mask_l], %[HWM], %[mask] \n\t"
+ "sll %[mask_r], %[mask], 8 \n\t"
+ "and %[mask_r], %[HWM], %[mask_r] \n\t"
+ "addq_s.ph %[vp8_filter_l], %[vp8_filter_l], %[subr_l] \n\t"
+ "addq_s.ph %[vp8_filter_r], %[vp8_filter_r], %[subr_r] \n\t"
+ "and %[hev_l], %[HWM], %[hev] \n\t"
+ "sll %[hev_r], %[hev], 8 \n\t"
+ "and %[hev_r], %[HWM], %[hev_r] \n\t"
+ "addq_s.ph %[vp8_filter_l], %[vp8_filter_l], %[subr_l] \n\t"
+ "addq_s.ph %[vp8_filter_r], %[vp8_filter_r], %[subr_r] \n\t"
+
+ /* vp8_filter &= mask; */
+ "and %[vp8_filter_l], %[vp8_filter_l], %[mask_l] \n\t"
+ "and %[vp8_filter_r], %[vp8_filter_r], %[mask_r] \n\t"
+
+ /* Filter2 = vp8_filter & hev; */
+ "and %[Filter2_l], %[vp8_filter_l], %[hev_l] \n\t"
+ "and %[Filter2_r], %[vp8_filter_r], %[hev_r] \n\t"
+
+ : [vp8_filter_l] "+r"(vp8_filter_l), [vp8_filter_r] "+r"(vp8_filter_r),
+ [hev_l] "=&r"(hev_l), [hev_r] "=&r"(hev_r), [mask_l] "=&r"(mask_l),
+ [mask_r] "=&r"(mask_r), [Filter2_l] "=&r"(Filter2_l),
+ [Filter2_r] "=&r"(Filter2_r)
+ : [subr_l] "r"(subr_l), [subr_r] "r"(subr_r), [HWM] "r"(HWM),
+ [hev] "r"(hev), [mask] "r"(mask));
+
+ /* save bottom 3 bits so that we round one side +4 and the other +3 */
+ __asm__ __volatile__(
+ /* Filter1 = vp8_signed_char_clamp(Filter2 + 4) >>= 3;
+ * (t2 carries the 4; invhev = hev ^ HWM is computed alongside for the
+ * wider filter further down)
+ */
+ "addq_s.ph %[Filter1_l], %[Filter2_l], %[t2] \n\t"
+ "xor %[invhev_l], %[hev_l], %[HWM] \n\t"
+ "addq_s.ph %[Filter1_r], %[Filter2_r], %[t2] \n\t"
+
+ /* Filter2 = vp8_signed_char_clamp(Filter2 + 3) >>= 3; (t1 carries the 3) */
+ "addq_s.ph %[Filter2_l], %[Filter2_l], %[t1] \n\t"
+ "addq_s.ph %[Filter2_r], %[Filter2_r], %[t1] \n\t"
+
+ "shra.ph %[Filter1_l], %[Filter1_l], 3 \n\t"
+ "shra.ph %[Filter1_r], %[Filter1_r], 3 \n\t"
+
+ "shra.ph %[Filter2_l], %[Filter2_l], 3 \n\t"
+ "shra.ph %[Filter2_r], %[Filter2_r], 3 \n\t"
+ "and %[Filter1_l], %[Filter1_l], %[HWM] \n\t"
+ "and %[Filter1_r], %[Filter1_r], %[HWM] \n\t"
+ "xor %[invhev_r], %[hev_r], %[HWM] \n\t"
+
+ /* qs0 = vp8_signed_char_clamp(qs0 - Filter1); */
+ "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t"
+ "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t"
+
+ /* ps0 = vp8_signed_char_clamp(ps0 + Filter2); */
+ "addq_s.ph %[vps0_l], %[vps0_l], %[Filter2_l] \n\t"
+ "addq_s.ph %[vps0_r], %[vps0_r], %[Filter2_r] \n\t"
+
+ : [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r),
+ [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r),
+ [Filter2_l] "+r"(Filter2_l), [Filter2_r] "+r"(Filter2_r),
+ [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l),
+ [vqs0_r] "+r"(vqs0_r)
+ : [t1] "r"(t1), [t2] "r"(t2), [HWM] "r"(HWM), [hev_l] "r"(hev_l),
+ [hev_r] "r"(hev_r));
+
+ /* only apply wider filter if not high edge variance */
+ __asm__ __volatile__(
+ /* vp8_filter &= ~hev;
+ * the result is kept in Filter2 and then shifted arithmetically down
+ * by 8 so the value occupies the low part of each halfword.
+ */
+ "and %[Filter2_l], %[vp8_filter_l], %[invhev_l] \n\t"
+ "and %[Filter2_r], %[vp8_filter_r], %[invhev_r] \n\t"
+
+ "shra.ph %[Filter2_l], %[Filter2_l], 8 \n\t"
+ "shra.ph %[Filter2_r], %[Filter2_r], 8 \n\t"
+
+ : [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r)
+ : [vp8_filter_l] "r"(vp8_filter_l), [vp8_filter_r] "r"(vp8_filter_r),
+ [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r));
+
+ /* roughly 3/7th difference across boundary
+ *
+ * The multiples are built with shifts and adds:
+ * u3 = (Filter2 << 3) + Filter2 = 9 * Filter2
+ * u2 = u3 << 1 = 18 * Filter2
+ * u1 = u3 + u2 = 27 * Filter2
+ * R63 is then added to each. u1 and u2 are finished here (>> 7, then << 8
+ * to return to the upper byte of the halfword); u3 keeps its 63 + 9 *
+ * Filter2 value for the 1/7th step further down.
+ */
+ __asm__ __volatile__(
+ "shll.ph %[u3_l], %[Filter2_l], 3 \n\t"
+ "shll.ph %[u3_r], %[Filter2_r], 3 \n\t"
+
+ "addq.ph %[u3_l], %[u3_l], %[Filter2_l] \n\t"
+ "addq.ph %[u3_r], %[u3_r], %[Filter2_r] \n\t"
+
+ "shll.ph %[u2_l], %[u3_l], 1 \n\t"
+ "shll.ph %[u2_r], %[u3_r], 1 \n\t"
+
+ "addq.ph %[u1_l], %[u3_l], %[u2_l] \n\t"
+ "addq.ph %[u1_r], %[u3_r], %[u2_r] \n\t"
+
+ "addq.ph %[u2_l], %[u2_l], %[R63] \n\t"
+ "addq.ph %[u2_r], %[u2_r], %[R63] \n\t"
+
+ "addq.ph %[u3_l], %[u3_l], %[R63] \n\t"
+ "addq.ph %[u3_r], %[u3_r], %[R63] \n\t"
+
+ /* u1 = vp8_signed_char_clamp((63 + Filter2 * 27) >> 7)
+ * u2 = vp8_signed_char_clamp((63 + Filter2 * 18) >> 7)
+ */
+ "addq.ph %[u1_l], %[u1_l], %[R63] \n\t"
+ "addq.ph %[u1_r], %[u1_r], %[R63] \n\t"
+ "shra.ph %[u1_l], %[u1_l], 7 \n\t"
+ "shra.ph %[u1_r], %[u1_r], 7 \n\t"
+ "shra.ph %[u2_l], %[u2_l], 7 \n\t"
+ "shra.ph %[u2_r], %[u2_r], 7 \n\t"
+ "shll.ph %[u1_l], %[u1_l], 8 \n\t"
+ "shll.ph %[u1_r], %[u1_r], 8 \n\t"
+ "shll.ph %[u2_l], %[u2_l], 8 \n\t"
+ "shll.ph %[u2_r], %[u2_r], 8 \n\t"
+
+ /* vqs0 = vp8_signed_char_clamp(qs0 - u); (u is u1 here) */
+ "subq_s.ph %[vqs0_l], %[vqs0_l], %[u1_l] \n\t"
+ "subq_s.ph %[vqs0_r], %[vqs0_r], %[u1_r] \n\t"
+
+ /* vps0 = vp8_signed_char_clamp(ps0 + u); (u is u1 here) */
+ "addq_s.ph %[vps0_l], %[vps0_l], %[u1_l] \n\t"
+ "addq_s.ph %[vps0_r], %[vps0_r], %[u1_r] \n\t"
+
+ : [u1_l] "=&r"(u1_l), [u1_r] "=&r"(u1_r), [u2_l] "=&r"(u2_l),
+ [u2_r] "=&r"(u2_r), [u3_l] "=&r"(u3_l), [u3_r] "=&r"(u3_r),
+ [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l),
+ [vqs0_r] "+r"(vqs0_r)
+ : [R63] "r"(R63), [Filter2_l] "r"(Filter2_l), [Filter2_r] "r"(Filter2_r));
+
+ __asm__ __volatile__(
+ /* _l lanes: vqs1 = vp8_signed_char_clamp(qs1 - u);
+ * vps1 = vp8_signed_char_clamp(ps1 + u); (u is u2 here)
+ */
+ "subq_s.ph %[vqs1_l], %[vqs1_l], %[u2_l] \n\t"
+ "addq_s.ph %[vps1_l], %[vps1_l], %[u2_l] \n\t"
+
+ /* _r lanes: vps1 = vp8_signed_char_clamp(ps1 + u);
+ * vqs1 = vp8_signed_char_clamp(qs1 - u); (u is u2 here)
+ */
+ "addq_s.ph %[vps1_r], %[vps1_r], %[u2_r] \n\t"
+ "subq_s.ph %[vqs1_r], %[vqs1_r], %[u2_r] \n\t"
+
+ : [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l),
+ [vqs1_r] "+r"(vqs1_r)
+ : [u2_l] "r"(u2_l), [u2_r] "r"(u2_r));
+
+ /* roughly 1/7th difference across boundary */
+ __asm__ __volatile__(
+ /* u = vp8_signed_char_clamp((63 + Filter2 * 9) >> 7);
+ * u3 already holds 63 + 9 * Filter2 from the block above.
+ */
+ "shra.ph %[u3_l], %[u3_l], 7 \n\t"
+ "shra.ph %[u3_r], %[u3_r], 7 \n\t"
+ "shll.ph %[u3_l], %[u3_l], 8 \n\t"
+ "shll.ph %[u3_r], %[u3_r], 8 \n\t"
+
+ /* vqs2 = vp8_signed_char_clamp(qs2 - u); */
+ "subq_s.ph %[vqs2_l], %[vqs2_l], %[u3_l] \n\t"
+ "subq_s.ph %[vqs2_r], %[vqs2_r], %[u3_r] \n\t"
+
+ /* vps2 = vp8_signed_char_clamp(ps2 + u); */
+ "addq_s.ph %[vps2_l], %[vps2_l], %[u3_l] \n\t"
+ "addq_s.ph %[vps2_r], %[vps2_r], %[u3_r] \n\t"
+
+ : [u3_l] "+r"(u3_l), [u3_r] "+r"(u3_r), [vps2_l] "+r"(vps2_l),
+ [vps2_r] "+r"(vps2_r), [vqs2_l] "+r"(vqs2_l), [vqs2_r] "+r"(vqs2_r)
+ :);
+
+ /* Create quad-bytes from halfword pairs:
+ * keep only the upper byte of each _l halfword, move the upper byte of each
+ * _r halfword down into the lower byte, then OR the pair together. The
+ * combined word ends up in the _r variable.
+ */
+ __asm__ __volatile__(
+ "and %[vqs0_l], %[vqs0_l], %[HWM] \n\t"
+ "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t"
+
+ "and %[vps0_l], %[vps0_l], %[HWM] \n\t"
+ "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t"
+
+ "and %[vqs1_l], %[vqs1_l], %[HWM] \n\t"
+ "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t"
+
+ "and %[vps1_l], %[vps1_l], %[HWM] \n\t"
+ "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t"
+
+ "and %[vqs2_l], %[vqs2_l], %[HWM] \n\t"
+ "shrl.ph %[vqs2_r], %[vqs2_r], 8 \n\t"
+
+ "and %[vps2_l], %[vps2_l], %[HWM] \n\t"
+ "shrl.ph %[vps2_r], %[vps2_r], 8 \n\t"
+
+ "or %[vqs0_r], %[vqs0_l], %[vqs0_r] \n\t"
+ "or %[vps0_r], %[vps0_l], %[vps0_r] \n\t"
+ "or %[vqs1_r], %[vqs1_l], %[vqs1_r] \n\t"
+ "or %[vps1_r], %[vps1_l], %[vps1_r] \n\t"
+ "or %[vqs2_r], %[vqs2_l], %[vqs2_r] \n\t"
+ "or %[vps2_r], %[vps2_l], %[vps2_r] \n\t"
+
+ : [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l),
+ [vqs1_r] "+r"(vqs1_r), [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r),
+ [vqs0_l] "+r"(vqs0_l), [vqs0_r] "+r"(vqs0_r), [vqs2_l] "+r"(vqs2_l),
+ [vqs2_r] "+r"(vqs2_r), [vps2_r] "+r"(vps2_r), [vps2_l] "+r"(vps2_l)
+ : [HWM] "r"(HWM));
+
+ *ps0 = vps0_r ^ N128; /* undo the entry XOR while writing the results back */
+ *ps1 = vps1_r ^ N128;
+ *ps2 = vps2_r ^ N128;
+ *qs0 = vqs0_r ^ N128;
+ *qs1 = vqs1_r ^ N128;
+ *qs2 = vqs2_r ^ N128;
+}
+
+void vp8_mbloop_filter_horizontal_edge_mips(unsigned char *s, int p,
+ unsigned int flimit,
+ unsigned int limit,
+ unsigned int thresh, int count) {
+ int i;
+ uint32_t mask, hev;
+ uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
+ unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
+
+ mask = 0;
+ hev = 0;
+ i = 0;
+ p1 = 0;
+ p2 = 0;
+ p3 = 0;
+ p4 = 0;
+
+ /* loop filter designed to work using chars so that we can make maximum use
+ * of 8 bit simd instructions.
+ *
+ * Macroblock filter across a horizontal edge. p is the row stride:
+ * sm1, s0, s1, s2 address rows s - 4p .. s - p and s3, s4, s5, s6 address
+ * rows s .. s + 3p. All eight rows feed vp8_filter_mask_vec_mips together
+ * with limit, flimit and thresh; when the returned mask is non-zero the six
+ * rows s0..s5 are rewritten by vp8_mbfilter_mips. The loop below runs at
+ * least once and continues while i (advanced by 8 per pass) is below count.
+ */
+
+ sm1 = s - (p << 2);
+ s0 = s - p - p - p;
+ s1 = s - p - p;
+ s2 = s - p;
+ s3 = s;
+ s4 = s + p;
+ s5 = s + p + p;
+ s6 = s + p + p + p;
+
+ /* prefetch data for load */
+ prefetch_load_lf(s + p);
+
+ /* apply filter on 4 pixels at the same time; the loop body is unrolled
+ * twice, so every pass covers 8 pixels of the edge
+ */
+ do {
+ /* load quad-byte vectors
+ * memory is 4 byte aligned
+ * (only the four rows needed for the early-out test are loaded here)
+ */
+ p1 = *((uint32_t *)(s1));
+ p2 = *((uint32_t *)(s2));
+ p3 = *((uint32_t *)(s3));
+ p4 = *((uint32_t *)(s4));
+
+ /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+ * mask will be zero and filtering is not needed
+ */
+ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
+ pm1 = *((uint32_t *)(sm1));
+ p0 = *((uint32_t *)(s0));
+ p5 = *((uint32_t *)(s5));
+ p6 = *((uint32_t *)(s6));
+
+ vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+ thresh, &hev, &mask);
+
+ /* if mask == 0 filtering is not needed */
+ if (mask) {
+ /* filtering */
+ vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
+
+ /* store the six filtered rows (s0..s5) back
+ * memory is 4 byte aligned
+ */
+ *((uint32_t *)s0) = p0;
+ *((uint32_t *)s1) = p1;
+ *((uint32_t *)s2) = p2;
+ *((uint32_t *)s3) = p3;
+ *((uint32_t *)s4) = p4;
+ *((uint32_t *)s5) = p5;
+ }
+ }
+
+ sm1 += 4;
+ s0 += 4;
+ s1 += 4;
+ s2 += 4;
+ s3 += 4;
+ s4 += 4;
+ s5 += 4;
+ s6 += 4;
+
+ /* load quad-byte vectors
+ * memory is 4 byte aligned
+ * (second unrolled copy: the next 4 pixels along the edge)
+ */
+ p1 = *((uint32_t *)(s1));
+ p2 = *((uint32_t *)(s2));
+ p3 = *((uint32_t *)(s3));
+ p4 = *((uint32_t *)(s4));
+
+ /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+ * mask will be zero and filtering is not needed
+ */
+ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
+ pm1 = *((uint32_t *)(sm1));
+ p0 = *((uint32_t *)(s0));
+ p5 = *((uint32_t *)(s5));
+ p6 = *((uint32_t *)(s6));
+
+ vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+ thresh, &hev, &mask);
+
+ /* if mask == 0 filtering is not needed */
+ if (mask) {
+ /* filtering */
+ vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
+
+ /* store the six filtered rows (s0..s5) back
+ * memory is 4 byte aligned
+ */
+ *((uint32_t *)s0) = p0;
+ *((uint32_t *)s1) = p1;
+ *((uint32_t *)s2) = p2;
+ *((uint32_t *)s3) = p3;
+ *((uint32_t *)s4) = p4;
+ *((uint32_t *)s5) = p5;
+ }
+ }
+
+ sm1 += 4;
+ s0 += 4;
+ s1 += 4;
+ s2 += 4;
+ s3 += 4;
+ s4 += 4;
+ s5 += 4;
+ s6 += 4;
+
+ i += 8;
+ }
+
+ while (i < count);
+}
+
+void vp8_mbloop_filter_uvhorizontal_edge_mips(unsigned char *s, int p,
+ unsigned int flimit,
+ unsigned int limit,
+ unsigned int thresh, int count) {
+ uint32_t mask, hev;
+ uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
+ unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
+ (void)count;
+
+ mask = 0;
+ hev = 0;
+ p1 = 0;
+ p2 = 0;
+ p3 = 0;
+ p4 = 0;
+
+ /* loop filter designed to work using chars so that we can make maximum use
+ * of 8 bit simd instructions.
+ *
+ * Macroblock filter across a horizontal edge, fixed-length variant: count
+ * is ignored and exactly two groups of 4 pixels (8 pixels) are processed.
+ * p is the row stride: sm1, s0, s1, s2 address rows s - 4p .. s - p and
+ * s3, s4, s5, s6 address rows s .. s + 3p. When the mask returned by
+ * vp8_filter_mask_vec_mips is non-zero, rows s0..s5 are rewritten by
+ * vp8_mbfilter_mips.
+ */
+
+ sm1 = s - (p << 2);
+ s0 = s - p - p - p;
+ s1 = s - p - p;
+ s2 = s - p;
+ s3 = s;
+ s4 = s + p;
+ s5 = s + p + p;
+ s6 = s + p + p + p;
+
+ /* load quad-byte vectors
+ * memory is 4 byte aligned
+ * (only the four rows needed for the early-out test are loaded here)
+ */
+ p1 = *((uint32_t *)(s1));
+ p2 = *((uint32_t *)(s2));
+ p3 = *((uint32_t *)(s3));
+ p4 = *((uint32_t *)(s4));
+
+ /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+ * mask will be zero and filtering is not needed
+ */
+ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
+ pm1 = *((uint32_t *)(sm1));
+ p0 = *((uint32_t *)(s0));
+ p5 = *((uint32_t *)(s5));
+ p6 = *((uint32_t *)(s6));
+
+ /* compute hev and mask; if mask == 0 filtering is not needed */
+ vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+ thresh, &hev, &mask);
+
+ if (mask) {
+ /* filtering */
+ vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
+
+ /* store the six filtered rows (s0..s5) back
+ * memory is 4 byte aligned
+ */
+ *((uint32_t *)s0) = p0;
+ *((uint32_t *)s1) = p1;
+ *((uint32_t *)s2) = p2;
+ *((uint32_t *)s3) = p3;
+ *((uint32_t *)s4) = p4;
+ *((uint32_t *)s5) = p5;
+ }
+ }
+
+ sm1 += 4;
+ s0 += 4;
+ s1 += 4;
+ s2 += 4;
+ s3 += 4;
+ s4 += 4;
+ s5 += 4;
+ s6 += 4;
+
+ /* load quad-byte vectors
+ * memory is 4 byte aligned
+ * (second group: the next 4 pixels along the edge)
+ */
+ p1 = *((uint32_t *)(s1));
+ p2 = *((uint32_t *)(s2));
+ p3 = *((uint32_t *)(s3));
+ p4 = *((uint32_t *)(s4));
+
+ /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+ * mask will be zero and filtering is not needed
+ */
+ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
+ pm1 = *((uint32_t *)(sm1));
+ p0 = *((uint32_t *)(s0));
+ p5 = *((uint32_t *)(s5));
+ p6 = *((uint32_t *)(s6));
+
+ vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+ thresh, &hev, &mask);
+
+ /* if mask == 0 filtering is not needed */
+ if (mask) {
+ /* filtering */
+ vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
+
+ /* store the six filtered rows (s0..s5) back
+ * memory is 4 byte aligned
+ */
+ *((uint32_t *)s0) = p0;
+ *((uint32_t *)s1) = p1;
+ *((uint32_t *)s2) = p2;
+ *((uint32_t *)s3) = p3;
+ *((uint32_t *)s4) = p4;
+ *((uint32_t *)s5) = p5;
+ }
+ }
+}
+
+void vp8_mbloop_filter_vertical_edge_mips(unsigned char *s, int p,
+ unsigned int flimit,
+ unsigned int limit,
+ unsigned int thresh, int count) {
+ int i;
+ uint32_t mask, hev;
+ uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
+ unsigned char *s1, *s2, *s3, *s4;
+ uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
+
+ mask = 0;
+ hev = 0;
+ i = 0;
+ pm1 = 0;
+ p0 = 0;
+ p1 = 0;
+ p2 = 0;
+ p3 = 0;
+ p4 = 0;
+ p5 = 0;
+ p6 = 0;
+
+ /* loop filter designed to work using chars so that we can make maximum use
+ * of 8 bit simd instructions.
+ *
+ * Macroblock filter across a vertical edge. Each pass of the loop takes
+ * four consecutive rows s1..s4 (stride p) starting at s, loads the word
+ * ending just before s and the word starting at s on every row, transposes
+ * both 4x4 byte groups, and hands the eight resulting words to
+ * vp8_filter_mask_vec_mips. When the returned mask is non-zero,
+ * vp8_mbfilter_mips filters p0..p5 and the results are stored to byte
+ * offsets -3..+2 of each row. s then advances by four rows; the loop runs
+ * at least once and continues while i (advanced by 4 per pass) is below
+ * count.
+ */
+
+ /* apply filter on 4 pixels at the same time */
+ do {
+ s1 = s;
+ s2 = s + p;
+ s3 = s2 + p;
+ s4 = s3 + p;
+ s = s4 + p;
+
+ /* load quad-byte vectors
+ * memory is 4 byte aligned
+ */
+ p2 = *((uint32_t *)(s1 - 4));
+ p6 = *((uint32_t *)(s1));
+ p1 = *((uint32_t *)(s2 - 4));
+ p5 = *((uint32_t *)(s2));
+ p0 = *((uint32_t *)(s3 - 4));
+ p4 = *((uint32_t *)(s3));
+ pm1 = *((uint32_t *)(s4 - 4));
+ p3 = *((uint32_t *)(s4));
+
+ /* transpose pm1, p0, p1, p2
+ * (the four words loaded from s1 - 4 .. s4 - 4; prim1..prim4, sec3 and
+ * sec4 are scratch registers)
+ */
+ __asm__ __volatile__(
+ "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t"
+ "precr.qb.ph %[prim2], %[p2], %[p1] \n\t"
+ "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t"
+ "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t"
+
+ "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p2], %[p1], %[sec3] \n\t"
+ "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t"
+ "append %[p1], %[sec3], 16 \n\t"
+ "append %[pm1], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
+ [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0),
+ [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
+ :);
+
+ /* transpose p3, p4, p5, p6
+ * (the four words loaded from s1 .. s4; same instruction sequence as
+ * above)
+ */
+ __asm__ __volatile__(
+ "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t"
+ "precr.qb.ph %[prim2], %[p6], %[p5] \n\t"
+ "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t"
+ "precr.qb.ph %[prim4], %[p4], %[p3] \n\t"
+
+ "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p6], %[p5], %[sec3] \n\t"
+ "precrq.ph.w %[p4], %[p3], %[sec4] \n\t"
+ "append %[p5], %[sec3], 16 \n\t"
+ "append %[p3], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
+ [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4),
+ [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
+ :);
+
+ /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+ * mask will be zero and filtering is not needed
+ */
+ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
+ vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+ thresh, &hev, &mask);
+
+ /* if mask == 0 filtering is not needed */
+ if (mask) {
+ /* filtering */
+ vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
+
+ /* don't use transpose on output data
+ * because memory isn't aligned
+ *
+ * Instead each result is stored byte by byte: p0..p5 go to byte
+ * offsets -3..+2 of a row. The low byte of every register is
+ * written to row s4 first; after each 8-bit right shift the next
+ * byte is written to row s3, then s2, then s1.
+ */
+ __asm__ __volatile__(
+ "sb %[p5], 2(%[s4]) \n\t"
+ "sb %[p4], 1(%[s4]) \n\t"
+ "sb %[p3], 0(%[s4]) \n\t"
+ "sb %[p2], -1(%[s4]) \n\t"
+ "sb %[p1], -2(%[s4]) \n\t"
+ "sb %[p0], -3(%[s4]) \n\t"
+ :
+ : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s4] "r"(s4),
+ [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));
+
+ __asm__ __volatile__(
+ "srl %[p5], %[p5], 8 \n\t"
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+ "srl %[p0], %[p0], 8 \n\t"
+ : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2),
+ [p1] "+r"(p1), [p0] "+r"(p0)
+ :);
+
+ __asm__ __volatile__(
+ "sb %[p5], 2(%[s3]) \n\t"
+ "sb %[p4], 1(%[s3]) \n\t"
+ "sb %[p3], 0(%[s3]) \n\t"
+ "sb %[p2], -1(%[s3]) \n\t"
+ "sb %[p1], -2(%[s3]) \n\t"
+ "sb %[p0], -3(%[s3]) \n\t"
+ :
+ : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s3] "r"(s3),
+ [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));
+
+ __asm__ __volatile__(
+ "srl %[p5], %[p5], 8 \n\t"
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+ "srl %[p0], %[p0], 8 \n\t"
+ : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2),
+ [p1] "+r"(p1), [p0] "+r"(p0)
+ :);
+
+ __asm__ __volatile__(
+ "sb %[p5], 2(%[s2]) \n\t"
+ "sb %[p4], 1(%[s2]) \n\t"
+ "sb %[p3], 0(%[s2]) \n\t"
+ "sb %[p2], -1(%[s2]) \n\t"
+ "sb %[p1], -2(%[s2]) \n\t"
+ "sb %[p0], -3(%[s2]) \n\t"
+ :
+ : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s2] "r"(s2),
+ [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));
+
+ __asm__ __volatile__(
+ "srl %[p5], %[p5], 8 \n\t"
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+ "srl %[p0], %[p0], 8 \n\t"
+ : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2),
+ [p1] "+r"(p1), [p0] "+r"(p0)
+ :);
+
+ __asm__ __volatile__(
+ "sb %[p5], 2(%[s1]) \n\t"
+ "sb %[p4], 1(%[s1]) \n\t"
+ "sb %[p3], 0(%[s1]) \n\t"
+ "sb %[p2], -1(%[s1]) \n\t"
+ "sb %[p1], -2(%[s1]) \n\t"
+ "sb %[p0], -3(%[s1]) \n\t"
+ :
+ : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s1] "r"(s1),
+ [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));
+ }
+ }
+
+ i += 4;
+ }
+
+ while (i < count);
+}
+
+void vp8_mbloop_filter_uvvertical_edge_mips(unsigned char *s, int p,
+ unsigned int flimit,
+ unsigned int limit,
+ unsigned int thresh, int count) {
+ uint32_t mask, hev;
+ uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
+ unsigned char *s1, *s2, *s3, *s4;
+ uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
+ (void)count;
+
+ mask = 0;
+ hev = 0;
+ pm1 = 0;
+ p0 = 0;
+ p1 = 0;
+ p2 = 0;
+ p3 = 0;
+ p4 = 0;
+ p5 = 0;
+ p6 = 0;
+
+ /* loop filter designed to work using chars so that we can make maximum use
+ * of 8 bit simd instructions.
+ */
+
+ /* apply filter on 4 pixesl at the same time */
+
+ s1 = s;
+ s2 = s + p;
+ s3 = s2 + p;
+ s4 = s3 + p;
+
+ /* prefetch data for load */
+ prefetch_load_lf(s + 2 * p);
+
+ /* load quad-byte vectors
+ * memory is 4 byte aligned
+ */
+ p2 = *((uint32_t *)(s1 - 4));
+ p6 = *((uint32_t *)(s1));
+ p1 = *((uint32_t *)(s2 - 4));
+ p5 = *((uint32_t *)(s2));
+ p0 = *((uint32_t *)(s3 - 4));
+ p4 = *((uint32_t *)(s3));
+ pm1 = *((uint32_t *)(s4 - 4));
+ p3 = *((uint32_t *)(s4));
+
+ /* transpose pm1, p0, p1, p2 */
+ __asm__ __volatile__(
+ "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t"
+ "precr.qb.ph %[prim2], %[p2], %[p1] \n\t"
+ "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t"
+ "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t"
+
+ "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p2], %[p1], %[sec3] \n\t"
+ "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t"
+ "append %[p1], %[sec3], 16 \n\t"
+ "append %[pm1], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
+ [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0),
+ [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
+ :);
+
+ /* transpose p3, p4, p5, p6 */
+ __asm__ __volatile__(
+ "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t"
+ "precr.qb.ph %[prim2], %[p6], %[p5] \n\t"
+ "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t"
+ "precr.qb.ph %[prim4], %[p4], %[p3] \n\t"
+
+ "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p6], %[p5], %[sec3] \n\t"
+ "precrq.ph.w %[p4], %[p3], %[sec4] \n\t"
+ "append %[p5], %[sec3], 16 \n\t"
+ "append %[p3], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
+ [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4),
+ [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
+ :);
+
+ /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+ * mask will be zero and filtering is not needed
+ */
+ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
+ vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+ thresh, &hev, &mask);
+
+ /* if mask == 0 do filtering is not needed */
+ if (mask) {
+ /* filtering */
+ vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
+
+ /* don't use transpose on output data
+ * because memory isn't aligned
+ */
+ __asm__ __volatile__(
+ "sb %[p5], 2(%[s4]) \n\t"
+ "sb %[p4], 1(%[s4]) \n\t"
+ "sb %[p3], 0(%[s4]) \n\t"
+ "sb %[p2], -1(%[s4]) \n\t"
+ "sb %[p1], -2(%[s4]) \n\t"
+ "sb %[p0], -3(%[s4]) \n\t"
+ :
+ : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s4] "r"(s4),
+ [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));
+
+ __asm__ __volatile__(
+ "srl %[p5], %[p5], 8 \n\t"
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+ "srl %[p0], %[p0], 8 \n\t"
+ : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2),
+ [p1] "+r"(p1), [p0] "+r"(p0)
+ :);
+
+ __asm__ __volatile__(
+ "sb %[p5], 2(%[s3]) \n\t"
+ "sb %[p4], 1(%[s3]) \n\t"
+ "sb %[p3], 0(%[s3]) \n\t"
+ "sb %[p2], -1(%[s3]) \n\t"
+ "sb %[p1], -2(%[s3]) \n\t"
+ "sb %[p0], -3(%[s3]) \n\t"
+ :
+ : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s3] "r"(s3),
+ [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));
+
+ __asm__ __volatile__(
+ "srl %[p5], %[p5], 8 \n\t"
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+ "srl %[p0], %[p0], 8 \n\t"
+ : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2),
+ [p1] "+r"(p1), [p0] "+r"(p0)
+ :);
+
+ __asm__ __volatile__(
+ "sb %[p5], 2(%[s2]) \n\t"
+ "sb %[p4], 1(%[s2]) \n\t"
+ "sb %[p3], 0(%[s2]) \n\t"
+ "sb %[p2], -1(%[s2]) \n\t"
+ "sb %[p1], -2(%[s2]) \n\t"
+ "sb %[p0], -3(%[s2]) \n\t"
+ :
+ : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s2] "r"(s2),
+ [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));
+
+ __asm__ __volatile__(
+ "srl %[p5], %[p5], 8 \n\t"
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+ "srl %[p0], %[p0], 8 \n\t"
+ : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2),
+ [p1] "+r"(p1), [p0] "+r"(p0)
+ :);
+
+ __asm__ __volatile__(
+ "sb %[p5], 2(%[s1]) \n\t"
+ "sb %[p4], 1(%[s1]) \n\t"
+ "sb %[p3], 0(%[s1]) \n\t"
+ "sb %[p2], -1(%[s1]) \n\t"
+ "sb %[p1], -2(%[s1]) \n\t"
+ "sb %[p0], -3(%[s1]) \n\t"
+ :
+ : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s1] "r"(s1),
+ [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));
+ }
+ }
+
+ s1 = s4 + p;
+ s2 = s1 + p;
+ s3 = s2 + p;
+ s4 = s3 + p;
+
+ /* load quad-byte vectors
+ * memory is 4 byte aligned
+ */
+ p2 = *((uint32_t *)(s1 - 4));
+ p6 = *((uint32_t *)(s1));
+ p1 = *((uint32_t *)(s2 - 4));
+ p5 = *((uint32_t *)(s2));
+ p0 = *((uint32_t *)(s3 - 4));
+ p4 = *((uint32_t *)(s3));
+ pm1 = *((uint32_t *)(s4 - 4));
+ p3 = *((uint32_t *)(s4));
+
+ /* transpose pm1, p0, p1, p2 */
+ __asm__ __volatile__(
+ "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t"
+ "precr.qb.ph %[prim2], %[p2], %[p1] \n\t"
+ "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t"
+ "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t"
+
+ "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p2], %[p1], %[sec3] \n\t"
+ "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t"
+ "append %[p1], %[sec3], 16 \n\t"
+ "append %[pm1], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
+ [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0),
+ [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
+ :);
+
+ /* transpose p3, p4, p5, p6 */
+ __asm__ __volatile__(
+ "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t"
+ "precr.qb.ph %[prim2], %[p6], %[p5] \n\t"
+ "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t"
+ "precr.qb.ph %[prim4], %[p4], %[p3] \n\t"
+
+ "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p6], %[p5], %[sec3] \n\t"
+ "precrq.ph.w %[p4], %[p3], %[sec4] \n\t"
+ "append %[p5], %[sec3], 16 \n\t"
+ "append %[p3], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
+ [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4),
+ [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
+ :);
+
+ /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+ * mask will be zero and filtering is not needed
+ */
+ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
+ vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+ thresh, &hev, &mask);
+
+ /* if mask == 0 do filtering is not needed */
+ if (mask) {
+ /* filtering */
+ vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
+
+ /* don't use transpose on output data
+ * because memory isn't aligned
+ */
+ __asm__ __volatile__(
+ "sb %[p5], 2(%[s4]) \n\t"
+ "sb %[p4], 1(%[s4]) \n\t"
+ "sb %[p3], 0(%[s4]) \n\t"
+ "sb %[p2], -1(%[s4]) \n\t"
+ "sb %[p1], -2(%[s4]) \n\t"
+ "sb %[p0], -3(%[s4]) \n\t"
+ :
+ : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s4] "r"(s4),
+ [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));
+
+ __asm__ __volatile__(
+ "srl %[p5], %[p5], 8 \n\t"
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+ "srl %[p0], %[p0], 8 \n\t"
+ : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2),
+ [p1] "+r"(p1), [p0] "+r"(p0)
+ :);
+
+ __asm__ __volatile__(
+ "sb %[p5], 2(%[s3]) \n\t"
+ "sb %[p4], 1(%[s3]) \n\t"
+ "sb %[p3], 0(%[s3]) \n\t"
+ "sb %[p2], -1(%[s3]) \n\t"
+ "sb %[p1], -2(%[s3]) \n\t"
+ "sb %[p0], -3(%[s3]) \n\t"
+ :
+ : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s3] "r"(s3),
+ [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));
+
+ __asm__ __volatile__(
+ "srl %[p5], %[p5], 8 \n\t"
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+ "srl %[p0], %[p0], 8 \n\t"
+ : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2),
+ [p1] "+r"(p1), [p0] "+r"(p0)
+ :);
+
+ __asm__ __volatile__(
+ "sb %[p5], 2(%[s2]) \n\t"
+ "sb %[p4], 1(%[s2]) \n\t"
+ "sb %[p3], 0(%[s2]) \n\t"
+ "sb %[p2], -1(%[s2]) \n\t"
+ "sb %[p1], -2(%[s2]) \n\t"
+ "sb %[p0], -3(%[s2]) \n\t"
+ :
+ : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s2] "r"(s2),
+ [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));
+
+ __asm__ __volatile__(
+ "srl %[p5], %[p5], 8 \n\t"
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+ "srl %[p0], %[p0], 8 \n\t"
+ : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2),
+ [p1] "+r"(p1), [p0] "+r"(p0)
+ :);
+
+ __asm__ __volatile__(
+ "sb %[p5], 2(%[s1]) \n\t"
+ "sb %[p4], 1(%[s1]) \n\t"
+ "sb %[p3], 0(%[s1]) \n\t"
+ "sb %[p2], -1(%[s1]) \n\t"
+ "sb %[p1], -2(%[s1]) \n\t"
+ "sb %[p0], -3(%[s1]) \n\t"
+ :
+ : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s1] "r"(s1),
+ [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));
+ }
+ }
+}
+
+/* Horizontal MB filtering.
+ *
+ * Reads one byte each from lfi->lim, lfi->mblim and lfi->hev_thr, replicates
+ * every byte into a quad-byte word with replv.qb, then runs the horizontal
+ * macroblock-edge filter on the Y plane and, for each non-NULL pointer, on
+ * the U and V planes.
+ */
+void vp8_loop_filter_mbh_dspr2(unsigned char *y_ptr, unsigned char *u_ptr,
+                               unsigned char *v_ptr, int y_stride,
+                               int uv_stride, loop_filter_info *lfi) {
+  /* scalar parameters, taken by value rather than through the pointers */
+  const unsigned char limit = *(lfi->lim);
+  const unsigned char flimit = *(lfi->mblim);
+  const unsigned char thresh = *(lfi->hev_thr);
+  unsigned int limit_vec, flimit_vec, thresh_vec;
+
+  /* broadcast each byte into all four lanes of a 32-bit word */
+  __asm__ __volatile__(
+      "replv.qb %[thresh_vec], %[thresh] \n\t"
+      "replv.qb %[flimit_vec], %[flimit] \n\t"
+      "replv.qb %[limit_vec], %[limit] \n\t"
+      : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
+        [limit_vec] "=r"(limit_vec)
+      : [thresh] "r"(thresh), [flimit] "r"(flimit), [limit] "r"(limit));
+
+  /* luma */
+  vp8_mbloop_filter_horizontal_edge_mips(y_ptr, y_stride, flimit_vec,
+                                         limit_vec, thresh_vec, 16);
+
+  /* chroma planes are optional */
+  if (u_ptr) {
+    vp8_mbloop_filter_uvhorizontal_edge_mips(u_ptr, uv_stride, flimit_vec,
+                                             limit_vec, thresh_vec, 0);
+  }
+  if (v_ptr) {
+    vp8_mbloop_filter_uvhorizontal_edge_mips(v_ptr, uv_stride, flimit_vec,
+                                             limit_vec, thresh_vec, 0);
+  }
+}
+
+/* Vertical MB filtering.
+ *
+ * Reads one byte each from lfi->lim, lfi->mblim and lfi->hev_thr, replicates
+ * every byte into a quad-byte word with replv.qb, then runs the vertical
+ * macroblock-edge filter on the Y plane and, for each non-NULL pointer, on
+ * the U and V planes.
+ */
+void vp8_loop_filter_mbv_dspr2(unsigned char *y_ptr, unsigned char *u_ptr,
+                               unsigned char *v_ptr, int y_stride,
+                               int uv_stride, loop_filter_info *lfi) {
+  /* scalar parameters, taken by value rather than through the pointers */
+  const unsigned char limit = *(lfi->lim);
+  const unsigned char flimit = *(lfi->mblim);
+  const unsigned char thresh = *(lfi->hev_thr);
+  unsigned int limit_vec, flimit_vec, thresh_vec;
+
+  /* broadcast each byte into all four lanes of a 32-bit word */
+  __asm__ __volatile__(
+      "replv.qb %[thresh_vec], %[thresh] \n\t"
+      "replv.qb %[flimit_vec], %[flimit] \n\t"
+      "replv.qb %[limit_vec], %[limit] \n\t"
+      : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
+        [limit_vec] "=r"(limit_vec)
+      : [thresh] "r"(thresh), [flimit] "r"(flimit), [limit] "r"(limit));
+
+  /* luma */
+  vp8_mbloop_filter_vertical_edge_mips(y_ptr, y_stride, flimit_vec, limit_vec,
+                                       thresh_vec, 16);
+
+  /* chroma planes are optional */
+  if (u_ptr) {
+    vp8_mbloop_filter_uvvertical_edge_mips(u_ptr, uv_stride, flimit_vec,
+                                           limit_vec, thresh_vec, 0);
+  }
+  if (v_ptr) {
+    vp8_mbloop_filter_uvvertical_edge_mips(v_ptr, uv_stride, flimit_vec,
+                                           limit_vec, thresh_vec, 0);
+  }
+}
+
+/* Horizontal B filtering.
+ *
+ * Reads one byte each from lfi->lim, lfi->blim and lfi->hev_thr, replicates
+ * every byte into a quad-byte word with replv.qb, then filters the three
+ * horizontal edges at rows 4, 8 and 12 of the Y plane and, for each non-NULL
+ * pointer, the edge at row 4 of the U and V planes.
+ */
+void vp8_loop_filter_bh_dspr2(unsigned char *y_ptr, unsigned char *u_ptr,
+                              unsigned char *v_ptr, int y_stride, int uv_stride,
+                              loop_filter_info *lfi) {
+  /* scalar parameters, taken by value rather than through the pointers */
+  const unsigned char limit = *(lfi->lim);
+  const unsigned char flimit = *(lfi->blim);
+  const unsigned char thresh = *(lfi->hev_thr);
+  unsigned int limit_vec, flimit_vec, thresh_vec;
+
+  /* broadcast each byte into all four lanes of a 32-bit word */
+  __asm__ __volatile__(
+      "replv.qb %[thresh_vec], %[thresh] \n\t"
+      "replv.qb %[flimit_vec], %[flimit] \n\t"
+      "replv.qb %[limit_vec], %[limit] \n\t"
+      : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
+        [limit_vec] "=r"(limit_vec)
+      : [thresh] "r"(thresh), [flimit] "r"(flimit), [limit] "r"(limit));
+
+  /* luma: inner edges 4, 8 and 12 rows below y_ptr */
+  vp8_loop_filter_horizontal_edge_mips(y_ptr + 4 * y_stride, y_stride,
+                                       flimit_vec, limit_vec, thresh_vec, 16);
+  vp8_loop_filter_horizontal_edge_mips(y_ptr + 8 * y_stride, y_stride,
+                                       flimit_vec, limit_vec, thresh_vec, 16);
+  vp8_loop_filter_horizontal_edge_mips(y_ptr + 12 * y_stride, y_stride,
+                                       flimit_vec, limit_vec, thresh_vec, 16);
+
+  /* chroma planes are optional; one inner edge each, 4 rows down */
+  if (u_ptr) {
+    vp8_loop_filter_uvhorizontal_edge_mips(u_ptr + 4 * uv_stride, uv_stride,
+                                           flimit_vec, limit_vec, thresh_vec,
+                                           0);
+  }
+  if (v_ptr) {
+    vp8_loop_filter_uvhorizontal_edge_mips(v_ptr + 4 * uv_stride, uv_stride,
+                                           flimit_vec, limit_vec, thresh_vec,
+                                           0);
+  }
+}
+
+/* Vertical B filtering.
+ *
+ * Reads one byte each from lfi->lim, lfi->blim and lfi->hev_thr, replicates
+ * every byte into a quad-byte word with replv.qb, then filters the three
+ * vertical edges at columns 4, 8 and 12 of the Y plane and, for each non-NULL
+ * pointer, the edge at column 4 of the U and V planes.
+ */
+void vp8_loop_filter_bv_dspr2(unsigned char *y_ptr, unsigned char *u_ptr,
+                              unsigned char *v_ptr, int y_stride, int uv_stride,
+                              loop_filter_info *lfi) {
+  /* scalar parameters, taken by value rather than through the pointers */
+  const unsigned char limit = *(lfi->lim);
+  const unsigned char flimit = *(lfi->blim);
+  const unsigned char thresh = *(lfi->hev_thr);
+  unsigned int limit_vec, flimit_vec, thresh_vec;
+
+  /* broadcast each byte into all four lanes of a 32-bit word */
+  __asm__ __volatile__(
+      "replv.qb %[thresh_vec], %[thresh] \n\t"
+      "replv.qb %[flimit_vec], %[flimit] \n\t"
+      "replv.qb %[limit_vec], %[limit] \n\t"
+      : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
+        [limit_vec] "=r"(limit_vec)
+      : [thresh] "r"(thresh), [flimit] "r"(flimit), [limit] "r"(limit));
+
+  /* luma: inner edges 4, 8 and 12 columns right of y_ptr */
+  vp8_loop_filter_vertical_edge_mips(y_ptr + 4, y_stride, flimit_vec,
+                                     limit_vec, thresh_vec, 16);
+  vp8_loop_filter_vertical_edge_mips(y_ptr + 8, y_stride, flimit_vec,
+                                     limit_vec, thresh_vec, 16);
+  vp8_loop_filter_vertical_edge_mips(y_ptr + 12, y_stride, flimit_vec,
+                                     limit_vec, thresh_vec, 16);
+
+  /* chroma planes are optional; one inner edge each, 4 columns in */
+  if (u_ptr) {
+    vp8_loop_filter_uvvertical_edge_mips(u_ptr + 4, uv_stride, flimit_vec,
+                                         limit_vec, thresh_vec, 0);
+  }
+  if (v_ptr) {
+    vp8_loop_filter_uvvertical_edge_mips(v_ptr + 4, uv_stride, flimit_vec,
+                                         limit_vec, thresh_vec, 0);
+  }
+}
+
+#endif
diff --git a/media/libvpx/libvpx/vp8/common/mips/mmi/copymem_mmi.c b/media/libvpx/libvpx/vp8/common/mips/mmi/copymem_mmi.c
new file mode 100644
index 0000000000..86a32aa9ef
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/mips/mmi/copymem_mmi.c
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vpx_ports/asmdefs_mmi.h"
+/* Asm fragment: copies two 16-byte rows from %[src] to %[dst].
+ * Each row moves as two 8-byte halves: bytes 0..7 go through an FP
+ * temporary (gsldlc1/gsldrc1 load, gssdlc1/gssdrc1 store) and bytes 8..15
+ * through an integer temporary (ldl/ldr load, sdl/sdr store). %[src] is
+ * advanced by %[src_stride] after each row is loaded and %[dst] by
+ * %[dst_stride] after each row is stored. Row 0 uses ftmp0/tmp0, row 1 uses
+ * ftmp1/tmp1.
+ * The enclosing asm statement must bind ftmp0, ftmp1, tmp0, tmp1, src, dst,
+ * src_stride and dst_stride. MMI_ADDU comes from vpx_ports/asmdefs_mmi.h
+ * (presumably a pointer-width add -- confirm there).
+ */
+#define COPY_MEM_16X2 \
+ "gsldlc1 %[ftmp0], 0x07(%[src]) \n\t" \
+ "gsldrc1 %[ftmp0], 0x00(%[src]) \n\t" \
+ "ldl %[tmp0], 0x0f(%[src]) \n\t" \
+ "ldr %[tmp0], 0x08(%[src]) \n\t" \
+ MMI_ADDU(%[src], %[src], %[src_stride]) \
+ "gssdlc1 %[ftmp0], 0x07(%[dst]) \n\t" \
+ "gssdrc1 %[ftmp0], 0x00(%[dst]) \n\t" \
+ "sdl %[tmp0], 0x0f(%[dst]) \n\t" \
+ "sdr %[tmp0], 0x08(%[dst]) \n\t" \
+ MMI_ADDU(%[dst], %[dst], %[dst_stride]) \
+ "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \
+ "ldl %[tmp1], 0x0f(%[src]) \n\t" \
+ "ldr %[tmp1], 0x08(%[src]) \n\t" \
+ MMI_ADDU(%[src], %[src], %[src_stride]) \
+ "gssdlc1 %[ftmp1], 0x07(%[dst]) \n\t" \
+ "gssdrc1 %[ftmp1], 0x00(%[dst]) \n\t" \
+ "sdl %[tmp1], 0x0f(%[dst]) \n\t" \
+ "sdr %[tmp1], 0x08(%[dst]) \n\t" \
+ MMI_ADDU(%[dst], %[dst], %[dst_stride])
+/* Asm fragment: copies two 8-byte rows from %[src] to %[dst].
+ * Both rows are loaded first (row 0 into the FP temporary ftmp0, row 1 into
+ * the integer temporary tmp0, advancing %[src] by %[src_stride] after each),
+ * then both are stored (advancing %[dst] by %[dst_stride] after each).
+ * The enclosing asm statement must bind ftmp0, tmp0, src, dst, src_stride
+ * and dst_stride; ftmp1 is not referenced here.
+ */
+#define COPY_MEM_8X2 \
+ "gsldlc1 %[ftmp0], 0x07(%[src]) \n\t" \
+ "gsldrc1 %[ftmp0], 0x00(%[src]) \n\t" \
+ MMI_ADDU(%[src], %[src], %[src_stride]) \
+ "ldl %[tmp0], 0x07(%[src]) \n\t" \
+ "ldr %[tmp0], 0x00(%[src]) \n\t" \
+ MMI_ADDU(%[src], %[src], %[src_stride]) \
+ \
+ "gssdlc1 %[ftmp0], 0x07(%[dst]) \n\t" \
+ "gssdrc1 %[ftmp0], 0x00(%[dst]) \n\t" \
+ MMI_ADDU(%[dst], %[dst], %[dst_stride]) \
+ "sdl %[tmp0], 0x07(%[dst]) \n\t" \
+ "sdr %[tmp0], 0x00(%[dst]) \n\t" \
+ MMI_ADDU(%[dst], %[dst], %[dst_stride])
+/* Copies a 16x16 block of bytes from src to dst.
+ *
+ * src / src_stride: first source byte and the byte step between source rows.
+ * dst / dst_stride: first destination byte and the byte step between
+ * destination rows.
+ *
+ * Each loop iteration expands COPY_MEM_16X2 twice (4 rows); with
+ * loop_count = 4 that covers 16 rows. The local copies of src and dst are
+ * advanced by the asm; the caller's pointers are unaffected.
+ */
+void vp8_copy_mem16x16_mmi(unsigned char *src, int src_stride,
+ unsigned char *dst, int dst_stride) {
+ double ftmp[2];
+ uint64_t tmp[2];
+ uint8_t loop_count = 4; /* 4 iterations x 4 rows per iteration */
+
+ /* clang-format off */
+ __asm__ volatile (
+ "1: \n\t"
+ COPY_MEM_16X2
+ COPY_MEM_16X2
+ MMI_ADDIU(%[loop_count], %[loop_count], -0x01)
+ "bnez %[loop_count], 1b \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
+ [loop_count]"+&r"(loop_count),
+ [dst]"+&r"(dst), [src]"+&r"(src)
+ : [src_stride]"r"((mips_reg)src_stride),
+ [dst_stride]"r"((mips_reg)dst_stride)
+ : "memory"
+ );
+ /* clang-format on */
+}
+/* Copies an 8x8 block of bytes from src to dst.
+ *
+ * src / src_stride: first source byte and the byte step between source rows.
+ * dst / dst_stride: first destination byte and the byte step between
+ * destination rows.
+ *
+ * Each loop iteration expands COPY_MEM_8X2 once (2 rows); with
+ * loop_count = 4 that covers 8 rows.
+ */
+void vp8_copy_mem8x8_mmi(unsigned char *src, int src_stride, unsigned char *dst,
+ int dst_stride) {
+ double ftmp[2]; /* ftmp[1] is bound as an operand but COPY_MEM_8X2 never references it */
+ uint64_t tmp[1];
+ uint8_t loop_count = 4; /* 4 iterations x 2 rows per iteration */
+
+ /* clang-format off */
+ __asm__ volatile (
+ "1: \n\t"
+ COPY_MEM_8X2
+ MMI_ADDIU(%[loop_count], %[loop_count], -0x01)
+ "bnez %[loop_count], 1b \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [tmp0]"=&r"(tmp[0]), [loop_count]"+&r"(loop_count),
+ [dst]"+&r"(dst), [src]"+&r"(src)
+ : [src_stride]"r"((mips_reg)src_stride),
+ [dst_stride]"r"((mips_reg)dst_stride)
+ : "memory"
+ );
+ /* clang-format on */
+}
+/* Copies an 8-wide, 4-row block of bytes from src to dst.
+ *
+ * src / src_stride: first source byte and the byte step between source rows.
+ * dst / dst_stride: first destination byte and the byte step between
+ * destination rows.
+ *
+ * No loop: COPY_MEM_8X2 is expanded twice, 2 rows each.
+ */
+void vp8_copy_mem8x4_mmi(unsigned char *src, int src_stride, unsigned char *dst,
+ int dst_stride) {
+ double ftmp[2]; /* ftmp[1] is bound as an operand but COPY_MEM_8X2 never references it */
+ uint64_t tmp[1];
+
+ /* clang-format off */
+ __asm__ volatile (
+ COPY_MEM_8X2
+ COPY_MEM_8X2
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [tmp0]"=&r"(tmp[0]),
+ [dst]"+&r"(dst), [src]"+&r"(src)
+ : [src_stride]"r"((mips_reg)src_stride),
+ [dst_stride]"r"((mips_reg)dst_stride)
+ : "memory"
+ );
+ /* clang-format on */
+}
diff --git a/media/libvpx/libvpx/vp8/common/mips/mmi/dequantize_mmi.c b/media/libvpx/libvpx/vp8/common/mips/mmi/dequantize_mmi.c
new file mode 100644
index 0000000000..b9330a6663
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/mips/mmi/dequantize_mmi.c
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vp8/common/blockd.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/asmdefs_mmi.h"
+/* Dequantizes one block: dqcoeff[i] = qcoeff[i] * DQC[i] for 16 halfwords.
+ *
+ * d:   block descriptor; 32 bytes are read from d->qcoeff and 32 bytes are
+ *      written to d->dqcoeff.
+ * DQC: 32 bytes (16 int16_t) of dequantization factors.
+ *
+ * The products are formed four halfwords at a time with pmullh (packed
+ * halfword multiply; presumably keeps the low 16 bits of each product --
+ * confirm against the Loongson MMI manual).
+ */
+void vp8_dequantize_b_mmi(BLOCKD *d, int16_t *DQC) {
+ double ftmp[8]; /* ftmp0-3: qcoeff, then the products; ftmp4-7: DQC */
+
+ __asm__ volatile(
+ "gsldlc1 %[ftmp0], 0x07(%[qcoeff]) \n\t"
+ "gsldrc1 %[ftmp0], 0x00(%[qcoeff]) \n\t"
+ "gsldlc1 %[ftmp1], 0x0f(%[qcoeff]) \n\t"
+ "gsldrc1 %[ftmp1], 0x08(%[qcoeff]) \n\t"
+ "gsldlc1 %[ftmp2], 0x17(%[qcoeff]) \n\t"
+ "gsldrc1 %[ftmp2], 0x10(%[qcoeff]) \n\t"
+ "gsldlc1 %[ftmp3], 0x1f(%[qcoeff]) \n\t"
+ "gsldrc1 %[ftmp3], 0x18(%[qcoeff]) \n\t"
+
+ "gsldlc1 %[ftmp4], 0x07(%[DQC]) \n\t"
+ "gsldrc1 %[ftmp4], 0x00(%[DQC]) \n\t"
+ "gsldlc1 %[ftmp5], 0x0f(%[DQC]) \n\t"
+ "gsldrc1 %[ftmp5], 0x08(%[DQC]) \n\t"
+ "gsldlc1 %[ftmp6], 0x17(%[DQC]) \n\t"
+ "gsldrc1 %[ftmp6], 0x10(%[DQC]) \n\t"
+ "gsldlc1 %[ftmp7], 0x1f(%[DQC]) \n\t"
+ "gsldrc1 %[ftmp7], 0x18(%[DQC]) \n\t"
+
+ "pmullh %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
+ "pmullh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
+ "pmullh %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
+ "pmullh %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
+
+ "gssdlc1 %[ftmp0], 0x07(%[dqcoeff]) \n\t"
+ "gssdrc1 %[ftmp0], 0x00(%[dqcoeff]) \n\t"
+ "gssdlc1 %[ftmp1], 0x0f(%[dqcoeff]) \n\t"
+ "gssdrc1 %[ftmp1], 0x08(%[dqcoeff]) \n\t"
+ "gssdlc1 %[ftmp2], 0x17(%[dqcoeff]) \n\t"
+ "gssdrc1 %[ftmp2], 0x10(%[dqcoeff]) \n\t"
+ "gssdlc1 %[ftmp3], 0x1f(%[dqcoeff]) \n\t"
+ "gssdrc1 %[ftmp3], 0x18(%[dqcoeff]) \n\t"
+ : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]),
+ [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
+ [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7])
+ : [dqcoeff] "r"(d->dqcoeff), [qcoeff] "r"(d->qcoeff), [DQC] "r"(DQC)
+ : "memory");
+}
+/* Dequantizes a 4x4 coefficient block in place, inverse-transforms it onto
+ * dest, and clears the coefficients.
+ *
+ * input:  16 int16_t coefficients; overwritten with input[i] * dq[i], then
+ *         zeroed (all 32 bytes) before returning.
+ * dq:     16 int16_t dequantization factors.
+ * dest:   passed to vp8_short_idct4x4llm_mmi as both the prediction and the
+ *         destination pointer.
+ * stride: passed as both the prediction and the destination stride.
+ */
+void vp8_dequant_idct_add_mmi(int16_t *input, int16_t *dq, unsigned char *dest,
+ int stride) {
+ double ftmp[8];
+
+ __asm__ volatile( /* step 1: input[i] *= dq[i] for all 16 halfwords */
+ "gsldlc1 %[ftmp0], 0x07(%[dq]) \n\t"
+ "gsldrc1 %[ftmp0], 0x00(%[dq]) \n\t"
+ "gsldlc1 %[ftmp1], 0x0f(%[dq]) \n\t"
+ "gsldrc1 %[ftmp1], 0x08(%[dq]) \n\t"
+ "gsldlc1 %[ftmp2], 0x17(%[dq]) \n\t"
+ "gsldrc1 %[ftmp2], 0x10(%[dq]) \n\t"
+ "gsldlc1 %[ftmp3], 0x1f(%[dq]) \n\t"
+ "gsldrc1 %[ftmp3], 0x18(%[dq]) \n\t"
+
+ "gsldlc1 %[ftmp4], 0x07(%[input]) \n\t"
+ "gsldrc1 %[ftmp4], 0x00(%[input]) \n\t"
+ "gsldlc1 %[ftmp5], 0x0f(%[input]) \n\t"
+ "gsldrc1 %[ftmp5], 0x08(%[input]) \n\t"
+ "gsldlc1 %[ftmp6], 0x17(%[input]) \n\t"
+ "gsldrc1 %[ftmp6], 0x10(%[input]) \n\t"
+ "gsldlc1 %[ftmp7], 0x1f(%[input]) \n\t"
+ "gsldrc1 %[ftmp7], 0x18(%[input]) \n\t"
+
+ "pmullh %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
+ "pmullh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
+ "pmullh %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
+ "pmullh %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
+
+ "gssdlc1 %[ftmp0], 0x07(%[input]) \n\t"
+ "gssdrc1 %[ftmp0], 0x00(%[input]) \n\t"
+ "gssdlc1 %[ftmp1], 0x0f(%[input]) \n\t"
+ "gssdrc1 %[ftmp1], 0x08(%[input]) \n\t"
+ "gssdlc1 %[ftmp2], 0x17(%[input]) \n\t"
+ "gssdrc1 %[ftmp2], 0x10(%[input]) \n\t"
+ "gssdlc1 %[ftmp3], 0x1f(%[input]) \n\t"
+ "gssdrc1 %[ftmp3], 0x18(%[input]) \n\t"
+ : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]),
+ [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
+ [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7])
+ : [dq] "r"(dq), [input] "r"(input)
+ : "memory");
+
+ vp8_short_idct4x4llm_mmi(input, dest, stride, dest, stride); /* step 2: IDCT, prediction read from and result written to dest */
+
+ __asm__ volatile( /* step 3: zero all 32 bytes of input, alternating FP-register and $0 stores */
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "gssdlc1 %[ftmp0], 0x07(%[input]) \n\t"
+ "gssdrc1 %[ftmp0], 0x00(%[input]) \n\t"
+ "sdl $0, 0x0f(%[input]) \n\t"
+ "sdr $0, 0x08(%[input]) \n\t"
+ "gssdlc1 %[ftmp0], 0x17(%[input]) \n\t"
+ "gssdrc1 %[ftmp0], 0x10(%[input]) \n\t"
+ "sdl $0, 0x1f(%[input]) \n\t"
+ "sdr $0, 0x18(%[input]) \n\t"
+ : [ftmp0] "=&f"(ftmp[0])
+ : [input] "r"(input)
+ : "memory");
+}
diff --git a/media/libvpx/libvpx/vp8/common/mips/mmi/idct_blk_mmi.c b/media/libvpx/libvpx/vp8/common/mips/mmi/idct_blk_mmi.c
new file mode 100644
index 0000000000..4fd6854c52
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/mips/mmi/idct_blk_mmi.c
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vpx_mem/vpx_mem.h"
+
+/* Dequantize + inverse-transform the sixteen 4x4 blocks of a 16x16 area.
+ *
+ * q:      coefficients, 16 int16_t per block, blocks in raster order.
+ * dq:     dequantization factors, shared by every block.
+ * dst:    top-left byte of the area; it serves as both prediction and
+ *         output.
+ * stride: byte step between rows of dst.
+ * eobs:   one entry per block. A value > 1 selects the full dequant+IDCT;
+ *         otherwise only the DC term q[0] * dq[0] is applied and the first
+ *         two coefficients of the block are cleared.
+ */
+void vp8_dequant_idct_add_y_block_mmi(int16_t *q, int16_t *dq, uint8_t *dst,
+                                      int stride, char *eobs) {
+  int row;
+
+  for (row = 0; row < 4; ++row) {
+    int col;
+
+    /* walk one row of four blocks: 16 coefficients and 4 pixels per step */
+    for (col = 0; col < 4; ++col, q += 16, dst += 4) {
+      const char eob = *eobs++;
+
+      if (eob <= 1) {
+        vp8_dc_only_idct_add_mmi(q[0] * dq[0], dst, stride, dst, stride);
+        memset(q, 0, 2 * sizeof(q[0]));
+        continue;
+      }
+
+      vp8_dequant_idct_add_mmi(q, dq, dst, stride);
+    }
+
+    /* back to column 0, four pixel rows further down */
+    dst += 4 * stride - 16;
+  }
+}
+
+/* Dequantize + inverse-transform the four 4x4 blocks of one 8x8 plane.
+ * Consumes 4 * 16 coefficients from q and 4 entries from eobs.
+ */
+static void dequant_idct_add_plane8x8_mmi(int16_t *q, int16_t *dq,
+                                          uint8_t *dst, int stride,
+                                          const char *eobs) {
+  int row;
+
+  for (row = 0; row < 2; ++row) {
+    int col;
+
+    for (col = 0; col < 2; ++col, q += 16, dst += 4) {
+      const char eob = *eobs++;
+
+      if (eob <= 1) {
+        vp8_dc_only_idct_add_mmi(q[0] * dq[0], dst, stride, dst, stride);
+        memset(q, 0, 2 * sizeof(q[0]));
+        continue;
+      }
+
+      vp8_dequant_idct_add_mmi(q, dq, dst, stride);
+    }
+
+    /* back to column 0, four pixel rows further down */
+    dst += 4 * stride - 8;
+  }
+}
+
+/* Dequantize + inverse-transform the chroma blocks: four 4x4 blocks for U
+ * followed by four for V.
+ *
+ * q:      coefficients, 16 int16_t per block; U blocks first, then V.
+ * dq:     dequantization factors, shared by every block.
+ * dst_u:  top-left byte of the 8x8 U area (prediction and output).
+ * dst_v:  top-left byte of the 8x8 V area (prediction and output).
+ * stride: byte step between rows, used for both planes.
+ * eobs:   one entry per block, U entries first. A value > 1 selects the full
+ *         dequant+IDCT; otherwise only the DC term q[0] * dq[0] is applied
+ *         and the first two coefficients of the block are cleared.
+ */
+void vp8_dequant_idct_add_uv_block_mmi(int16_t *q, int16_t *dq, uint8_t *dst_u,
+                                       uint8_t *dst_v, int stride, char *eobs) {
+  /* U plane: blocks 0..3 */
+  dequant_idct_add_plane8x8_mmi(q, dq, dst_u, stride, eobs);
+
+  /* V plane: the next four blocks of coefficients and eobs */
+  dequant_idct_add_plane8x8_mmi(q + 4 * 16, dq, dst_v, stride, eobs + 4);
+}
diff --git a/media/libvpx/libvpx/vp8/common/mips/mmi/idctllm_mmi.c b/media/libvpx/libvpx/vp8/common/mips/mmi/idctllm_mmi.c
new file mode 100644
index 0000000000..a35689dd30
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/mips/mmi/idctllm_mmi.c
@@ -0,0 +1,335 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/asmdefs_mmi.h"
+/* Asm fragment: by its name, a 4x4 halfword transpose of the rows held in
+ * ftmp1..ftmp4, with the result written back to ftmp1..ftmp4.
+ * Scratch: ftmp0 is zeroed; ftmp5..ftmp9 are overwritten; ftmp10 receives
+ * the pshufh control value 0x93 via tmp0 (so tmp0 is clobbered too).
+ * The enclosing asm statement must bind ftmp0..ftmp10 and tmp0. Callers that
+ * keep live data in any of the scratch registers must reload it afterwards.
+ */
+#define TRANSPOSE_4H \
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \
+ MMI_LI(%[tmp0], 0x93) \
+ "mtc1 %[tmp0], %[ftmp10] \n\t" \
+ "punpcklhw %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \
+ "punpcklhw %[ftmp9], %[ftmp2], %[ftmp0] \n\t" \
+ "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \
+ "por %[ftmp5], %[ftmp5], %[ftmp9] \n\t" \
+ "punpckhhw %[ftmp6], %[ftmp1], %[ftmp0] \n\t" \
+ "punpckhhw %[ftmp9], %[ftmp2], %[ftmp0] \n\t" \
+ "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \
+ "por %[ftmp6], %[ftmp6], %[ftmp9] \n\t" \
+ "punpcklhw %[ftmp7], %[ftmp3], %[ftmp0] \n\t" \
+ "punpcklhw %[ftmp9], %[ftmp4], %[ftmp0] \n\t" \
+ "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \
+ "por %[ftmp7], %[ftmp7], %[ftmp9] \n\t" \
+ "punpckhhw %[ftmp8], %[ftmp3], %[ftmp0] \n\t" \
+ "punpckhhw %[ftmp9], %[ftmp4], %[ftmp0] \n\t" \
+ "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \
+ "por %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \
+ "punpcklwd %[ftmp1], %[ftmp5], %[ftmp7] \n\t" \
+ "punpckhwd %[ftmp2], %[ftmp5], %[ftmp7] \n\t" \
+ "punpcklwd %[ftmp3], %[ftmp6], %[ftmp8] \n\t" \
+ "punpckhwd %[ftmp4], %[ftmp6], %[ftmp8] \n\t"
+
+/* 4x4 inverse DCT of `input`, added to a 4x4 prediction and stored as bytes.
+ *
+ * input:       16 int16_t coefficients (32 bytes are read).
+ * pred_ptr:    top-left byte of the 4x4 prediction; 4 bytes are read per row.
+ * pred_stride: byte step between prediction rows.
+ * dst_ptr:     top-left byte of the 4x4 output; 4 bytes are written per row.
+ * dst_stride:  byte step between output rows.
+ *
+ * The transform runs as two 1-D passes with a TRANSPOSE_4H after each.
+ * Multiplications use pmulhh with 0x4e7b (named cospi8sqrt2minus1 in the
+ * comments below) and with 0x22a3 applied to the operand pre-shifted left
+ * by 2 (named sinpi8sqrt2 below). The second pass rounds with (x + 4) >> 3.
+ * Each result row is added to the zero-extended prediction row and packed
+ * back to bytes with unsigned saturation (packushb).
+ *
+ * The o32 branches load the prediction through tmp0 with ulw/mtc1; the
+ * other ABIs load it straight into the FP register. Both must name the
+ * declared operand %[pred_ptr].
+ */
+void vp8_short_idct4x4llm_mmi(int16_t *input, unsigned char *pred_ptr,
+ int pred_stride, unsigned char *dst_ptr,
+ int dst_stride) {
+ double ftmp[12];
+ uint64_t tmp[1];
+ double ff_ph_04, ff_ph_4e7b, ff_ph_22a3;
+
+ __asm__ volatile (
+ "dli %[tmp0], 0x0004000400040004 \n\t"
+ "dmtc1 %[tmp0], %[ff_ph_04] \n\t"
+ "dli %[tmp0], 0x4e7b4e7b4e7b4e7b \n\t"
+ "dmtc1 %[tmp0], %[ff_ph_4e7b] \n\t"
+ "dli %[tmp0], 0x22a322a322a322a3 \n\t"
+ "dmtc1 %[tmp0], %[ff_ph_22a3] \n\t"
+ MMI_LI(%[tmp0], 0x02)
+ "dmtc1 %[tmp0], %[ftmp11] \n\t"
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+
+ "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t"
+ "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t"
+ "gsldlc1 %[ftmp2], 0x0f(%[ip]) \n\t"
+ "gsldrc1 %[ftmp2], 0x08(%[ip]) \n\t"
+ "gsldlc1 %[ftmp3], 0x17(%[ip]) \n\t"
+ "gsldrc1 %[ftmp3], 0x10(%[ip]) \n\t"
+ "gsldlc1 %[ftmp4], 0x1f(%[ip]) \n\t"
+ "gsldrc1 %[ftmp4], 0x18(%[ip]) \n\t"
+
+ // ip[0...3] + ip[8...11]
+ "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t"
+ // ip[0...3] - ip[8...11]
+ "psubh %[ftmp6], %[ftmp1], %[ftmp3] \n\t"
+ // (ip[12...15] * sinpi8sqrt2) >> 16
+ "psllh %[ftmp9], %[ftmp4], %[ftmp11] \n\t"
+ "pmulhh %[ftmp7], %[ftmp9], %[ff_ph_22a3] \n\t"
+ // (ip[ 4... 7] * sinpi8sqrt2) >> 16
+ "psllh %[ftmp9], %[ftmp2], %[ftmp11] \n\t"
+ "pmulhh %[ftmp8], %[ftmp9], %[ff_ph_22a3] \n\t"
+ // ip[ 4... 7] + ((ip[ 4... 7] * cospi8sqrt2minus1) >> 16)
+ "pmulhh %[ftmp9], %[ftmp2], %[ff_ph_4e7b] \n\t"
+ "paddh %[ftmp9], %[ftmp9], %[ftmp2] \n\t"
+ // ip[12...15] + ((ip[12...15] * cospi8sqrt2minus1) >> 16)
+ "pmulhh %[ftmp10], %[ftmp4], %[ff_ph_4e7b] \n\t"
+ "paddh %[ftmp10], %[ftmp10], %[ftmp4] \n\t"
+
+ "paddh %[ftmp1], %[ftmp5], %[ftmp7] \n\t"
+ "paddh %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
+ "paddh %[ftmp2], %[ftmp6], %[ftmp8] \n\t"
+ "psubh %[ftmp2], %[ftmp2], %[ftmp10] \n\t"
+ "psubh %[ftmp3], %[ftmp6], %[ftmp8] \n\t"
+ "paddh %[ftmp3], %[ftmp3], %[ftmp10] \n\t"
+ "psubh %[ftmp4], %[ftmp5], %[ftmp7] \n\t"
+ "psubh %[ftmp4], %[ftmp4], %[ftmp9] \n\t"
+
+ TRANSPOSE_4H
+ // a
+ "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t"
+ // b
+ "psubh %[ftmp6], %[ftmp1], %[ftmp3] \n\t"
+ // c
+ "psllh %[ftmp9], %[ftmp2], %[ftmp11] \n\t"
+ "pmulhh %[ftmp9], %[ftmp9], %[ff_ph_22a3] \n\t"
+ "psubh %[ftmp7], %[ftmp9], %[ftmp4] \n\t"
+ "pmulhh %[ftmp10], %[ftmp4], %[ff_ph_4e7b] \n\t"
+ "psubh %[ftmp7], %[ftmp7], %[ftmp10] \n\t"
+ // d
+ "psllh %[ftmp9], %[ftmp4], %[ftmp11] \n\t"
+ "pmulhh %[ftmp9], %[ftmp9], %[ff_ph_22a3] \n\t"
+ "paddh %[ftmp8], %[ftmp9], %[ftmp2] \n\t"
+ "pmulhh %[ftmp10], %[ftmp2], %[ff_ph_4e7b] \n\t"
+ "paddh %[ftmp8], %[ftmp8], %[ftmp10] \n\t"
+
+ // switch the shift count in ftmp11 from 2 to 3 for the final rounding
+ MMI_LI(%[tmp0], 0x03)
+ "mtc1 %[tmp0], %[ftmp11] \n\t"
+ // a + d
+ "paddh %[ftmp1], %[ftmp5], %[ftmp8] \n\t"
+ "paddh %[ftmp1], %[ftmp1], %[ff_ph_04] \n\t"
+ "psrah %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
+ // b + c
+ "paddh %[ftmp2], %[ftmp6], %[ftmp7] \n\t"
+ "paddh %[ftmp2], %[ftmp2], %[ff_ph_04] \n\t"
+ "psrah %[ftmp2], %[ftmp2], %[ftmp11] \n\t"
+ // b - c
+ "psubh %[ftmp3], %[ftmp6], %[ftmp7] \n\t"
+ "paddh %[ftmp3], %[ftmp3], %[ff_ph_04] \n\t"
+ "psrah %[ftmp3], %[ftmp3], %[ftmp11] \n\t"
+ // a - d
+ "psubh %[ftmp4], %[ftmp5], %[ftmp8] \n\t"
+ "paddh %[ftmp4], %[ftmp4], %[ff_ph_04] \n\t"
+ "psrah %[ftmp4], %[ftmp4], %[ftmp11] \n\t"
+
+ TRANSPOSE_4H
+ // row 0: prediction + residual, saturate to bytes, store
+#if _MIPS_SIM == _ABIO32
+ "ulw %[tmp0], 0x00(%[pred_ptr]) \n\t"
+ "mtc1 %[tmp0], %[ftmp5] \n\t"
+#else
+ "gslwlc1 %[ftmp5], 0x03(%[pred_ptr]) \n\t"
+ "gslwrc1 %[ftmp5], 0x00(%[pred_ptr]) \n\t"
+#endif
+ "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
+ "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
+ "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
+ "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t"
+ "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t"
+ MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride])
+ MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride])
+
+ // row 1
+#if _MIPS_SIM == _ABIO32
+ "ulw %[tmp0], 0x00(%[pred_ptr]) \n\t"
+ "mtc1 %[tmp0], %[ftmp6] \n\t"
+#else
+ "gslwlc1 %[ftmp6], 0x03(%[pred_ptr]) \n\t"
+ "gslwrc1 %[ftmp6], 0x00(%[pred_ptr]) \n\t"
+#endif
+ "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
+ "paddh %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
+ "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
+ "gsswlc1 %[ftmp2], 0x03(%[dst_ptr]) \n\t"
+ "gsswrc1 %[ftmp2], 0x00(%[dst_ptr]) \n\t"
+ MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride])
+ MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride])
+
+ // row 2
+#if _MIPS_SIM == _ABIO32
+ "ulw %[tmp0], 0x00(%[pred_ptr]) \n\t"
+ "mtc1 %[tmp0], %[ftmp7] \n\t"
+#else
+ "gslwlc1 %[ftmp7], 0x03(%[pred_ptr]) \n\t"
+ "gslwrc1 %[ftmp7], 0x00(%[pred_ptr]) \n\t"
+#endif
+ "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
+ "paddh %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
+ "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
+ "gsswlc1 %[ftmp3], 0x03(%[dst_ptr]) \n\t"
+ "gsswrc1 %[ftmp3], 0x00(%[dst_ptr]) \n\t"
+ MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride])
+ MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride])
+
+ // row 3
+#if _MIPS_SIM == _ABIO32
+ "ulw %[tmp0], 0x00(%[pred_ptr]) \n\t"
+ "mtc1 %[tmp0], %[ftmp8] \n\t"
+#else
+ "gslwlc1 %[ftmp8], 0x03(%[pred_ptr]) \n\t"
+ "gslwrc1 %[ftmp8], 0x00(%[pred_ptr]) \n\t"
+#endif
+ "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t"
+ "paddh %[ftmp4], %[ftmp4], %[ftmp8] \n\t"
+ "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
+ "gsswlc1 %[ftmp4], 0x03(%[dst_ptr]) \n\t"
+ "gsswrc1 %[ftmp4], 0x00(%[dst_ptr]) \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]),
+ [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), [ftmp8]"=&f"(ftmp[8]),
+ [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]),
+ [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]),
+ [pred_ptr]"+&r"(pred_ptr), [dst_ptr]"+&r"(dst_ptr),
+ [ff_ph_4e7b]"=&f"(ff_ph_4e7b), [ff_ph_04]"=&f"(ff_ph_04),
+ [ff_ph_22a3]"=&f"(ff_ph_22a3)
+ : [ip]"r"(input),
+ [pred_stride]"r"((mips_reg)pred_stride),
+ [dst_stride]"r"((mips_reg)dst_stride)
+ : "memory"
+ );
+}
+/* DC-only inverse transform: adds the rounded DC value to a 4x4 prediction.
+ *
+ * input_dc:    dequantized DC coefficient; the value added to every pixel
+ *              is (input_dc + 4) >> 3.
+ * pred_ptr:    top-left byte of the 4x4 prediction; 4 bytes are read per row.
+ * pred_stride: byte step between prediction rows.
+ * dst_ptr:     top-left byte of the 4x4 output; 4 bytes are written per row.
+ * dst_stride:  byte step between output rows.
+ *
+ * Per row: the 4 prediction bytes are zero-extended to halfwords
+ * (punpcklbh with zero), the DC value is added with paddsh, and the result
+ * is packed back to bytes with packushb before the 4-byte store.
+ */
+void vp8_dc_only_idct_add_mmi(int16_t input_dc, unsigned char *pred_ptr,
+ int pred_stride, unsigned char *dst_ptr,
+ int dst_stride) {
+ int a0 = ((input_dc + 4) >> 3);
+ double a1, ftmp[5]; /* ftmp[3] and ftmp[4] are bound below but never referenced by the asm */
+ int low32;
+
+ __asm__ volatile (
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "dmtc1 %[a0], %[a1] \n\t"
+ "pshufh %[a1], %[a1], %[ftmp0] \n\t" /* selector is all-zero; presumably replicates halfword 0 of a1 to all four lanes */
+ "ulw %[low32], 0x00(%[pred_ptr]) \n\t"
+ "mtc1 %[low32], %[ftmp1] \n\t"
+ "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t"
+ "paddsh %[ftmp2], %[ftmp2], %[a1] \n\t"
+ "packushb %[ftmp1], %[ftmp2], %[ftmp0] \n\t"
+ "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t"
+ "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t"
+
+ MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride])
+ MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride])
+ "ulw %[low32], 0x00(%[pred_ptr]) \n\t"
+ "mtc1 %[low32], %[ftmp1] \n\t"
+ "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t"
+ "paddsh %[ftmp2], %[ftmp2], %[a1] \n\t"
+ "packushb %[ftmp1], %[ftmp2], %[ftmp0] \n\t"
+ "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t"
+ "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t"
+
+ MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride])
+ MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride])
+ "ulw %[low32], 0x00(%[pred_ptr]) \n\t"
+ "mtc1 %[low32], %[ftmp1] \n\t"
+ "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t"
+ "paddsh %[ftmp2], %[ftmp2], %[a1] \n\t"
+ "packushb %[ftmp1], %[ftmp2], %[ftmp0] \n\t"
+ "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t"
+ "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t"
+
+ MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride])
+ MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride])
+ "ulw %[low32], 0x00(%[pred_ptr]) \n\t"
+ "mtc1 %[low32], %[ftmp1] \n\t"
+ "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t"
+ "paddsh %[ftmp2], %[ftmp2], %[a1] \n\t"
+ "packushb %[ftmp1], %[ftmp2], %[ftmp0] \n\t"
+ "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t"
+ "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]),
+ [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [low32]"=&r"(low32),
+ [dst_ptr]"+&r"(dst_ptr), [pred_ptr]"+&r"(pred_ptr), [a1]"=&f"(a1)
+ : [dst_stride]"r"((mips_reg)dst_stride),
+ [pred_stride]"r"((mips_reg)pred_stride), [a0]"r"(a0)
+ : "memory"
+ );
+}
+/* 4x4 inverse Walsh-Hadamard transform (per the function name).
+ *
+ * input:      16 int16_t coefficients (32 bytes are read).
+ * mb_dqcoeff: destination; result i is written to mb_dqcoeff[i * 16], i.e.
+ *             one value every 16 int16_t, for i = 0..15.
+ *
+ * The asm runs two add/subtract butterfly passes with a TRANSPOSE_4H after
+ * each, rounding every value with (x + 3) >> 3 at the end of the second
+ * pass, and stores the 16 results into the local `output` array; the C loop
+ * then scatters them.
+ */
+void vp8_short_inv_walsh4x4_mmi(int16_t *input, int16_t *mb_dqcoeff) {
+ int i;
+ int16_t output[16]; /* contiguous staging buffer filled by the asm */
+ double ff_ph_03, ftmp[12];
+ uint64_t tmp[1];
+
+ __asm__ volatile (
+ "dli %[tmp0], 0x0003000300030003 \n\t"
+ "dmtc1 %[tmp0], %[ff_ph_03] \n\t"
+ MMI_LI(%[tmp0], 0x03)
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "dmtc1 %[tmp0], %[ftmp11] \n\t"
+ "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t"
+ "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t"
+ "gsldlc1 %[ftmp2], 0x0f(%[ip]) \n\t"
+ "gsldrc1 %[ftmp2], 0x08(%[ip]) \n\t"
+ "gsldlc1 %[ftmp3], 0x17(%[ip]) \n\t"
+ "gsldrc1 %[ftmp3], 0x10(%[ip]) \n\t"
+ "gsldlc1 %[ftmp4], 0x1f(%[ip]) \n\t"
+ "gsldrc1 %[ftmp4], 0x18(%[ip]) \n\t"
+ "paddh %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
+ "psubh %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
+ "paddh %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
+ "psubh %[ftmp8], %[ftmp3], %[ftmp4] \n\t"
+
+ "paddh %[ftmp1], %[ftmp5], %[ftmp7] \n\t"
+ "psubh %[ftmp2], %[ftmp5], %[ftmp7] \n\t"
+ "psubh %[ftmp3], %[ftmp6], %[ftmp8] \n\t"
+ "paddh %[ftmp4], %[ftmp6], %[ftmp8] \n\t"
+
+ TRANSPOSE_4H
+ // a
+ "paddh %[ftmp5], %[ftmp1], %[ftmp4] \n\t"
+ // d
+ "psubh %[ftmp6], %[ftmp1], %[ftmp4] \n\t"
+ // b
+ "paddh %[ftmp7], %[ftmp2], %[ftmp3] \n\t"
+ // c
+ "psubh %[ftmp8], %[ftmp2], %[ftmp3] \n\t"
+
+ "paddh %[ftmp1], %[ftmp5], %[ftmp7] \n\t"
+ "paddh %[ftmp2], %[ftmp6], %[ftmp8] \n\t"
+ "psubh %[ftmp3], %[ftmp5], %[ftmp7] \n\t"
+ "psubh %[ftmp4], %[ftmp6], %[ftmp8] \n\t"
+
+ "paddh %[ftmp1], %[ftmp1], %[ff_ph_03] \n\t"
+ "psrah %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
+ "paddh %[ftmp2], %[ftmp2], %[ff_ph_03] \n\t"
+ "psrah %[ftmp2], %[ftmp2], %[ftmp11] \n\t"
+ "paddh %[ftmp3], %[ftmp3], %[ff_ph_03] \n\t"
+ "psrah %[ftmp3], %[ftmp3], %[ftmp11] \n\t"
+ "paddh %[ftmp4], %[ftmp4], %[ff_ph_03] \n\t"
+ "psrah %[ftmp4], %[ftmp4], %[ftmp11] \n\t"
+
+ TRANSPOSE_4H
+ "gssdlc1 %[ftmp1], 0x07(%[op]) \n\t"
+ "gssdrc1 %[ftmp1], 0x00(%[op]) \n\t"
+ "gssdlc1 %[ftmp2], 0x0f(%[op]) \n\t"
+ "gssdrc1 %[ftmp2], 0x08(%[op]) \n\t"
+ "gssdlc1 %[ftmp3], 0x17(%[op]) \n\t"
+ "gssdrc1 %[ftmp3], 0x10(%[op]) \n\t"
+ "gssdlc1 %[ftmp4], 0x1f(%[op]) \n\t"
+ "gssdrc1 %[ftmp4], 0x18(%[op]) \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]),
+ [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), [ftmp8]"=&f"(ftmp[8]),
+ [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]),
+ [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]), [ff_ph_03]"=&f"(ff_ph_03)
+ : [ip]"r"(input), [op]"r"(output)
+ : "memory"
+ );
+
+ for (i = 0; i < 16; i++) { /* scatter: one result per 16-coefficient stride of mb_dqcoeff */
+ mb_dqcoeff[i * 16] = output[i];
+ }
+}
diff --git a/media/libvpx/libvpx/vp8/common/mips/mmi/loopfilter_filters_mmi.c b/media/libvpx/libvpx/vp8/common/mips/mmi/loopfilter_filters_mmi.c
new file mode 100644
index 0000000000..a07a7e3b41
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/mips/mmi/loopfilter_filters_mmi.c
@@ -0,0 +1,1415 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vp8/common/loopfilter.h"
+#include "vp8/common/onyxc_int.h"
+#include "vpx_ports/asmdefs_mmi.h"
+/*
+ * Filters across a horizontal edge at src_ptr using Loongson MMI inline
+ * assembly, 8 adjacent byte columns per loop iteration.
+ *
+ * Each iteration reads the eight rows src_ptr - 4 * src_pixel_step ..
+ * src_ptr + 3 * src_pixel_step (called p3, p2, p1, p0, q0, q1, q2, q3 in
+ * the comments below), derives a per-byte mask from limit and blimit and a
+ * second per-byte mask ("hev") from thresh, and writes back modified p1,
+ * p0, q0 and q1. src_ptr then advances by 8 bytes.
+ *
+ * limit, blimit and thresh are each read as 8 bytes. count is tested only
+ * after the loop body has run, so it must be >= 1 on entry.
+ */
+void vp8_loop_filter_horizontal_edge_mmi(
+ unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit,
+ const unsigned char *limit, const unsigned char *thresh, int count) {
+ uint64_t tmp[1];
+ mips_reg addr[2];
+ double ftmp[12];
+ double ff_ph_01, ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_03;
+ /* clang-format off */
+ __asm__ volatile (
+ "dli %[tmp0], 0x0001000100010001 \n\t" /* ff_ph_01: 0x0001 per halfword */
+ "dmtc1 %[tmp0], %[ff_ph_01] \n\t"
+ "dli %[tmp0], 0xfefefefefefefefe \n\t" /* ff_pb_fe: 0xfe per byte */
+ "dmtc1 %[tmp0], %[ff_pb_fe] \n\t"
+ "dli %[tmp0], 0x8080808080808080 \n\t" /* ff_pb_80: 0x80 per byte */
+ "dmtc1 %[tmp0], %[ff_pb_80] \n\t"
+ "dli %[tmp0], 0x0404040404040404 \n\t" /* ff_pb_04: 0x04 per byte */
+ "dmtc1 %[tmp0], %[ff_pb_04] \n\t"
+ "dli %[tmp0], 0x0303030303030303 \n\t" /* ff_pb_03: 0x03 per byte */
+ "dmtc1 %[tmp0], %[ff_pb_03] \n\t"
+ "1: \n\t" /* loop head: one pass per 8 columns */
+ "gsldlc1 %[ftmp10], 0x07(%[limit]) \n\t" /* ftmp10 = 8 bytes of limit */
+ "gsldrc1 %[ftmp10], 0x00(%[limit]) \n\t"
+
+ MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step]) /* addr0 = q1 row */
+
+ MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x4])
+ "gsldlc1 %[ftmp1], 0x07(%[addr1]) \n\t" /* ftmp1 = p3 */
+ "gsldrc1 %[ftmp1], 0x00(%[addr1]) \n\t"
+
+ MMI_SUBU(%[addr1], %[addr0], %[src_pixel_step_x4])
+ "gsldlc1 %[ftmp3], 0x07(%[addr1]) \n\t" /* ftmp3 = p2 */
+ "gsldrc1 %[ftmp3], 0x00(%[addr1]) \n\t"
+ "pasubub %[ftmp0], %[ftmp1], %[ftmp3] \n\t" /* abs(p3 - p2) */
+ "psubusb %[ftmp0], %[ftmp0], %[ftmp10] \n\t" /* nonzero only where it exceeds limit */
+
+ MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2])
+ "gsldlc1 %[ftmp4], 0x07(%[addr1]) \n\t" /* ftmp4 = p1 */
+ "gsldrc1 %[ftmp4], 0x00(%[addr1]) \n\t"
+ "pasubub %[ftmp1], %[ftmp3], %[ftmp4] \n\t" /* abs(p2 - p1) */
+ "psubusb %[ftmp1], %[ftmp1], %[ftmp10] \n\t"
+ "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" /* ftmp0 accumulates every limit violation */
+
+ MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp5], 0x07(%[addr1]) \n\t" /* ftmp5 = p0 */
+ "gsldrc1 %[ftmp5], 0x00(%[addr1]) \n\t"
+ "pasubub %[ftmp9], %[ftmp4], %[ftmp5] \n\t" /* ftmp9 = abs(p1 - p0), reused for the thresh test */
+ "psubusb %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
+ "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
+
+ "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" /* ftmp6 = q0 */
+ "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t"
+
+ "gsldlc1 %[ftmp7], 0x07(%[addr0]) \n\t" /* ftmp7 = q1 */
+ "gsldrc1 %[ftmp7], 0x00(%[addr0]) \n\t"
+ "pasubub %[ftmp11], %[ftmp7], %[ftmp6] \n\t" /* ftmp11 = abs(q1 - q0), reused for the thresh test */
+ "psubusb %[ftmp1], %[ftmp11], %[ftmp10] \n\t"
+ "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
+
+ MMI_ADDU(%[addr1], %[src_ptr], %[src_pixel_step_x2])
+ "gsldlc1 %[ftmp8], 0x07(%[addr1]) \n\t" /* ftmp8 = q2 */
+ "gsldrc1 %[ftmp8], 0x00(%[addr1]) \n\t"
+ "pasubub %[ftmp1], %[ftmp8], %[ftmp7] \n\t" /* abs(q2 - q1) */
+ "psubusb %[ftmp1], %[ftmp1], %[ftmp10] \n\t"
+ "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
+
+ MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step_x2])
+ "gsldlc1 %[ftmp2], 0x07(%[addr1]) \n\t" /* ftmp2 = q3 */
+ "gsldrc1 %[ftmp2], 0x00(%[addr1]) \n\t"
+ "pasubub %[ftmp1], %[ftmp2], %[ftmp8] \n\t" /* abs(q3 - q2) */
+ "psubusb %[ftmp1], %[ftmp1], %[ftmp10] \n\t"
+ "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
+
+ "pasubub %[ftmp1], %[ftmp5], %[ftmp6] \n\t" /* abs(p0 - q0) */
+ "paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" /* ... doubled, saturating */
+ "pasubub %[ftmp2], %[ftmp4], %[ftmp7] \n\t" /* abs(p1 - q1) */
+ "pand %[ftmp2], %[ftmp2], %[ff_pb_fe] \n\t" /* clear bit 0 so the halfword shift cannot leak between bytes */
+ "dli %[tmp0], 0x01 \n\t"
+ "dmtc1 %[tmp0], %[ftmp10] \n\t" /* ftmp10 now holds the shift count 1 */
+ "psrlh %[ftmp2], %[ftmp2], %[ftmp10] \n\t" /* abs(p1 - q1) / 2 */
+ "paddusb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ "gsldlc1 %[ftmp10], 0x07(%[blimit]) \n\t" /* ftmp10 = 8 bytes of blimit */
+ "gsldrc1 %[ftmp10], 0x00(%[blimit]) \n\t"
+ "psubusb %[ftmp1], %[ftmp1], %[ftmp10] \n\t" /* nonzero only where the sum exceeds blimit */
+ "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
+ "pxor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
+ "pcmpeqb %[ftmp0], %[ftmp0], %[ftmp10] \n\t" /* ftmp0 = mask: 0xff where no test was violated */
+
+ "gsldlc1 %[ftmp10], 0x07(%[thresh]) \n\t" /* ftmp10 = 8 bytes of thresh */
+ "gsldrc1 %[ftmp10], 0x00(%[thresh]) \n\t"
+ "psubusb %[ftmp1], %[ftmp9], %[ftmp10] \n\t" /* excess of abs(p1 - p0) over thresh */
+ "psubusb %[ftmp2], %[ftmp11], %[ftmp10] \n\t" /* excess of abs(q1 - q0) over thresh */
+ "paddb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ "pxor %[ftmp2], %[ftmp2], %[ftmp2] \n\t"
+ "pcmpeqb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" /* 0xff where the summed excess is zero */
+ "pcmpeqb %[ftmp2], %[ftmp2], %[ftmp2] \n\t" /* ftmp2 = all ones */
+ "pxor %[ftmp1], %[ftmp1], %[ftmp2] \n\t" /* ftmp1 = hev: the inverse of the above */
+
+ "pxor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t" /* flip bit 7 of p1, p0, q0, q1 ahead of the signed saturating ops */
+ "pxor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t"
+ "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t"
+ "pxor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t"
+
+ "psubsb %[ftmp2], %[ftmp4], %[ftmp7] \n\t" /* p1 - q1 */
+ "pand %[ftmp2], %[ftmp2], %[ftmp1] \n\t" /* ... kept only where hev is set */
+ "psubsb %[ftmp3], %[ftmp6], %[ftmp5] \n\t" /* q0 - p0 */
+ "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" /* add (q0 - p0) three times, saturating each time */
+ "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
+ "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
+ "pand %[ftmp2], %[ftmp2], %[ftmp0] \n\t" /* ftmp2 = filter value, zeroed where mask is clear */
+
+ "paddsb %[ftmp8], %[ftmp2], %[ff_pb_03] \n\t" /* ftmp8 = filter + 3 */
+ "paddsb %[ftmp9], %[ftmp2], %[ff_pb_04] \n\t" /* ftmp9 = filter + 4 */
+
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "pxor %[ftmp11], %[ftmp11], %[ftmp11] \n\t"
+ "punpcklbh %[ftmp0], %[ftmp0], %[ftmp8] \n\t" /* widen the bytes of ftmp8 to halfwords */
+ "punpckhbh %[ftmp11], %[ftmp11], %[ftmp8] \n\t"
+
+ "dli %[tmp0], 0x0b \n\t" /* shift count 11; NOTE(review): presumably 8 + 3, i.e. a signed byte >> 3 if the unpack puts each byte in the upper half of its halfword -- confirm */
+ "dmtc1 %[tmp0], %[ftmp10] \n\t"
+ "psrah %[ftmp0], %[ftmp0], %[ftmp10] \n\t"
+ "psrah %[ftmp11], %[ftmp11], %[ftmp10] \n\t"
+ "packsshb %[ftmp8], %[ftmp0], %[ftmp11] \n\t" /* ftmp8 = shifted (filter + 3), packed back to bytes */
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp0], %[ftmp0], %[ftmp9] \n\t" /* same widening and shift for filter + 4 */
+ "psrah %[ftmp0], %[ftmp0], %[ftmp10] \n\t"
+ "pxor %[ftmp11], %[ftmp11], %[ftmp11] \n\t"
+ "punpckhbh %[ftmp9], %[ftmp11], %[ftmp9] \n\t"
+ "psrah %[ftmp9], %[ftmp9], %[ftmp10] \n\t"
+ "paddsh %[ftmp11], %[ftmp0], %[ff_ph_01] \n\t" /* low halfwords + 1 (feeds the p1/q1 adjustment) */
+ "packsshb %[ftmp0], %[ftmp0], %[ftmp9] \n\t" /* ftmp0 = shifted (filter + 4), packed back to bytes */
+ "paddsh %[ftmp9], %[ftmp9], %[ff_ph_01] \n\t" /* high halfwords + 1 */
+
+ "dli %[tmp0], 0x01 \n\t"
+ "dmtc1 %[tmp0], %[ftmp10] \n\t"
+ "psrah %[ftmp11], %[ftmp11], %[ftmp10] \n\t" /* (value + 1) >> 1 on halfwords */
+ "psrah %[ftmp9], %[ftmp9], %[ftmp10] \n\t"
+ "packsshb %[ftmp11], %[ftmp11], %[ftmp9] \n\t"
+ "pandn %[ftmp1], %[ftmp1], %[ftmp11] \n\t" /* ftmp1 = ~hev & ftmp11: outer adjustment only where hev is clear */
+ "paddsb %[ftmp5], %[ftmp5], %[ftmp8] \n\t" /* p0 += ftmp8 */
+ "pxor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" /* undo the bit-7 flip before storing */
+
+ MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
+ "gssdlc1 %[ftmp5], 0x07(%[addr1]) \n\t" /* store p0 */
+ "gssdrc1 %[ftmp5], 0x00(%[addr1]) \n\t"
+ MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2])
+ "paddsb %[ftmp4], %[ftmp4], %[ftmp1] \n\t" /* p1 += ftmp1 */
+ "pxor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t"
+ "gssdlc1 %[ftmp4], 0x07(%[addr1]) \n\t" /* store p1 */
+ "gssdrc1 %[ftmp4], 0x00(%[addr1]) \n\t"
+
+ "psubsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t" /* q0 -= ftmp0 */
+ "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t"
+ "gssdlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" /* store q0 */
+ "gssdrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t"
+
+ "psubsb %[ftmp7], %[ftmp7], %[ftmp1] \n\t" /* q1 -= ftmp1 */
+ "pxor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t"
+ "gssdlc1 %[ftmp7], 0x07(%[addr0]) \n\t" /* store q1 */
+ "gssdrc1 %[ftmp7], 0x00(%[addr0]) \n\t"
+
+ "addiu %[count], %[count], -0x01 \n\t"
+ MMI_ADDIU(%[src_ptr], %[src_ptr], 0x08) /* step to the next 8 columns */
+ "bnez %[count], 1b \n\t" /* count is tested after the body (do-while) */
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
+ [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
+ [tmp0]"=&r"(tmp[0]),
+ [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
+ [src_ptr]"+&r"(src_ptr), [count]"+&r"(count),
+ [ff_ph_01]"=&f"(ff_ph_01), [ff_pb_fe]"=&f"(ff_pb_fe),
+ [ff_pb_80]"=&f"(ff_pb_80), [ff_pb_04]"=&f"(ff_pb_04),
+ [ff_pb_03]"=&f"(ff_pb_03)
+ : [limit]"r"(limit), [blimit]"r"(blimit),
+ [thresh]"r"(thresh),
+ [src_pixel_step]"r"((mips_reg)src_pixel_step),
+ [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)),
+ [src_pixel_step_x4]"r"((mips_reg)(src_pixel_step<<2))
+ : "memory"
+ );
+ /* clang-format on */
+}
+/*
+ * Filters across a vertical edge at src_ptr using Loongson MMI inline
+ * assembly, 8 rows per loop iteration.
+ *
+ * Before the loop src_ptr is moved 4 rows down and 4 bytes left. Each
+ * iteration then loads 8 bytes from each of the rows src_ptr -
+ * 4 * src_pixel_step .. src_ptr + 3 * src_pixel_step, rearranges them with
+ * unpack instructions into the registers labelled p3 .. q3 below, computes
+ * the mask (from limit and blimit), hev (from thresh) and the filter value,
+ * and stores the modified p1, p0, q0, q1 back, one 32-bit word per row.
+ * src_ptr then advances by 8 rows.
+ *
+ * limit, blimit and thresh are each read as 8 bytes. count is tested only
+ * after the loop body has run, so it must be >= 1 on entry.
+ */
+void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr,
+ int src_pixel_step,
+ const unsigned char *blimit,
+ const unsigned char *limit,
+ const unsigned char *thresh, int count) {
+ uint64_t tmp[1];
+ mips_reg addr[2];
+ double ftmp[13];
+ double ff_pb_fe, ff_ph_01, ff_pb_03, ff_pb_04, ff_pb_80;
+
+ /* clang-format off */
+ __asm__ volatile (
+ "dli %[tmp0], 0xfefefefefefefefe \n\t" /* ff_pb_fe: 0xfe per byte */
+ "dmtc1 %[tmp0], %[ff_pb_fe] \n\t"
+ "dli %[tmp0], 0x0001000100010001 \n\t" /* ff_ph_01: 0x0001 per halfword */
+ "dmtc1 %[tmp0], %[ff_ph_01] \n\t"
+ "dli %[tmp0], 0x0303030303030303 \n\t" /* ff_pb_03: 0x03 per byte */
+ "dmtc1 %[tmp0], %[ff_pb_03] \n\t"
+ "dli %[tmp0], 0x0404040404040404 \n\t" /* ff_pb_04: 0x04 per byte */
+ "dmtc1 %[tmp0], %[ff_pb_04] \n\t"
+ "dli %[tmp0], 0x8080808080808080 \n\t" /* ff_pb_80: 0x80 per byte */
+ "dmtc1 %[tmp0], %[ff_pb_80] \n\t"
+ MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0]) /* src_ptr += 4 rows */
+ MMI_SUBU(%[src_ptr], %[src_ptr], 0x04) /* ... and 4 bytes to the left */
+
+ "1: \n\t" /* loop head: one pass per 8 rows */
+ MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step]) /* addr0 = src_ptr + 1 row */
+
+ MMI_SLL (%[tmp0], %[src_pixel_step], 0x01)
+ MMI_ADDU(%[addr1], %[src_ptr], %[tmp0])
+ "gsldlc1 %[ftmp11], 0x07(%[addr1]) \n\t" /* row src_ptr + 2 */
+ "gsldrc1 %[ftmp11], 0x00(%[addr1]) \n\t"
+ MMI_ADDU(%[addr1], %[addr0], %[tmp0])
+ "gsldlc1 %[ftmp12], 0x07(%[addr1]) \n\t" /* row src_ptr + 3 */
+ "gsldrc1 %[ftmp12], 0x00(%[addr1]) \n\t"
+ "punpcklbh %[ftmp1], %[ftmp11], %[ftmp12] \n\t" /* interleave bytes of the two rows */
+ "punpckhbh %[ftmp2], %[ftmp11], %[ftmp12] \n\t"
+
+ "gsldlc1 %[ftmp11], 0x07(%[src_ptr]) \n\t" /* row src_ptr + 0 */
+ "gsldrc1 %[ftmp11], 0x00(%[src_ptr]) \n\t"
+ "gsldlc1 %[ftmp12], 0x07(%[addr0]) \n\t" /* row src_ptr + 1 */
+ "gsldrc1 %[ftmp12], 0x00(%[addr0]) \n\t"
+ "punpcklbh %[ftmp3], %[ftmp11], %[ftmp12] \n\t"
+ "punpckhbh %[ftmp4], %[ftmp11], %[ftmp12] \n\t"
+
+ "punpcklhw %[ftmp5], %[ftmp4], %[ftmp2] \n\t" /* interleave halfwords: rows +0 .. +3 combined */
+ "punpckhhw %[ftmp6], %[ftmp4], %[ftmp2] \n\t"
+ "punpcklhw %[ftmp7], %[ftmp3], %[ftmp1] \n\t"
+ "punpckhhw %[ftmp8], %[ftmp3], %[ftmp1] \n\t"
+
+ MMI_SLL(%[tmp0], %[src_pixel_step], 0x01)
+ MMI_SUBU(%[addr1], %[src_ptr], %[tmp0])
+ "gsldlc1 %[ftmp11], 0x07(%[addr1]) \n\t" /* row src_ptr - 2 */
+ "gsldrc1 %[ftmp11], 0x00(%[addr1]) \n\t"
+ MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp12], 0x07(%[addr1]) \n\t" /* row src_ptr - 1 */
+ "gsldrc1 %[ftmp12], 0x00(%[addr1]) \n\t"
+ "punpcklbh %[ftmp9], %[ftmp11], %[ftmp12] \n\t"
+ "punpckhbh %[ftmp10], %[ftmp11], %[ftmp12] \n\t"
+
+ MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
+ MMI_SUBU(%[addr1], %[src_ptr], %[tmp0])
+ "gsldlc1 %[ftmp11], 0x07(%[addr1]) \n\t" /* row src_ptr - 4 */
+ "gsldrc1 %[ftmp11], 0x00(%[addr1]) \n\t"
+ MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
+ MMI_SUBU(%[addr1], %[addr0], %[tmp0])
+ "gsldlc1 %[ftmp12], 0x07(%[addr1]) \n\t" /* row src_ptr - 3 */
+ "gsldrc1 %[ftmp12], 0x00(%[addr1]) \n\t"
+ "punpcklbh %[ftmp0], %[ftmp11], %[ftmp12] \n\t"
+ "punpckhbh %[ftmp11], %[ftmp11], %[ftmp12] \n\t"
+
+ "punpcklhw %[ftmp1], %[ftmp11], %[ftmp10] \n\t" /* interleave halfwords: rows -4 .. -1 combined */
+ "punpckhhw %[ftmp2], %[ftmp11], %[ftmp10] \n\t"
+ "punpcklhw %[ftmp3], %[ftmp0], %[ftmp9] \n\t"
+ "punpckhhw %[ftmp4], %[ftmp0], %[ftmp9] \n\t"
+
+ /* ftmp9:q0 ftmp10:q1 */
+ "punpcklwd %[ftmp9], %[ftmp1], %[ftmp5] \n\t"
+ "punpckhwd %[ftmp10], %[ftmp1], %[ftmp5] \n\t"
+ /* ftmp11:q2 ftmp12:q3 */
+ "punpcklwd %[ftmp11], %[ftmp2], %[ftmp6] \n\t"
+ "punpckhwd %[ftmp12], %[ftmp2], %[ftmp6] \n\t"
+ /* ftmp1:p3 ftmp2:p2 */
+ "punpcklwd %[ftmp1], %[ftmp3], %[ftmp7] \n\t"
+ "punpckhwd %[ftmp2], %[ftmp3], %[ftmp7] \n\t"
+ /* ftmp5:p1 ftmp6:p0 */
+ "punpcklwd %[ftmp5], %[ftmp4], %[ftmp8] \n\t"
+ "punpckhwd %[ftmp6], %[ftmp4], %[ftmp8] \n\t"
+
+ "gsldlc1 %[ftmp8], 0x07(%[limit]) \n\t" /* ftmp8 = 8 bytes of limit */
+ "gsldrc1 %[ftmp8], 0x00(%[limit]) \n\t"
+
+ /* abs (q3-q2) */
+ "pasubub %[ftmp7], %[ftmp12], %[ftmp11] \n\t"
+ "psubusb %[ftmp0], %[ftmp7], %[ftmp8] \n\t" /* nonzero only where it exceeds limit */
+ /* abs (q2-q1) */
+ "pasubub %[ftmp7], %[ftmp11], %[ftmp10] \n\t"
+ "psubusb %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
+ "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" /* ftmp0 accumulates every limit violation */
+ /* ftmp3: abs(q1-q0) */
+ "pasubub %[ftmp3], %[ftmp10], %[ftmp9] \n\t"
+ "psubusb %[ftmp7], %[ftmp3], %[ftmp8] \n\t"
+ "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
+ /* ftmp4: abs(p1-p0) */
+ "pasubub %[ftmp4], %[ftmp5], %[ftmp6] \n\t"
+ "psubusb %[ftmp7], %[ftmp4], %[ftmp8] \n\t"
+ "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
+ /* abs (p2-p1) */
+ "pasubub %[ftmp7], %[ftmp2], %[ftmp5] \n\t"
+ "psubusb %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
+ "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
+ /* abs (p3-p2) */
+ "pasubub %[ftmp7], %[ftmp1], %[ftmp2] \n\t"
+ "psubusb %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
+ "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
+
+ "gsldlc1 %[ftmp8], 0x07(%[blimit]) \n\t" /* ftmp8 = 8 bytes of blimit */
+ "gsldrc1 %[ftmp8], 0x00(%[blimit]) \n\t"
+
+ /* abs (p0-q0) */
+ "pasubub %[ftmp11], %[ftmp9], %[ftmp6] \n\t"
+ "paddusb %[ftmp11], %[ftmp11], %[ftmp11] \n\t" /* ... doubled, saturating */
+ /* abs (p1-q1) */
+ "pasubub %[ftmp12], %[ftmp10], %[ftmp5] \n\t"
+ "pand %[ftmp12], %[ftmp12], %[ff_pb_fe] \n\t" /* clear bit 0 so the halfword shift cannot leak between bytes */
+ "dli %[tmp0], 0x01 \n\t"
+ "dmtc1 %[tmp0], %[ftmp1] \n\t" /* ftmp1 (p3, no longer needed) becomes the shift count 1 */
+ "psrlh %[ftmp12], %[ftmp12], %[ftmp1] \n\t" /* abs(p1 - q1) / 2 */
+ "paddusb %[ftmp1], %[ftmp11], %[ftmp12] \n\t"
+ "psubusb %[ftmp1], %[ftmp1], %[ftmp8] \n\t" /* nonzero only where the sum exceeds blimit */
+ "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
+ "pxor %[ftmp1], %[ftmp1], %[ftmp1] \n\t"
+ /* ftmp0:mask */
+ "pcmpeqb %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
+
+ "gsldlc1 %[ftmp8], 0x07(%[thresh]) \n\t" /* ftmp8 = 8 bytes of thresh */
+ "gsldrc1 %[ftmp8], 0x00(%[thresh]) \n\t"
+
+ /* ftmp3: abs(q1-q0) ftmp4: abs(p1-p0) */
+ "psubusb %[ftmp4], %[ftmp4], %[ftmp8] \n\t"
+ "psubusb %[ftmp3], %[ftmp3], %[ftmp8] \n\t"
+ "por %[ftmp2], %[ftmp4], %[ftmp3] \n\t"
+ "pcmpeqb %[ftmp2], %[ftmp2], %[ftmp1] \n\t" /* 0xff where neither difference exceeds thresh */
+ "pcmpeqb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" /* ftmp1 = all ones */
+ /* ftmp1:hev */
+ "pxor %[ftmp1], %[ftmp2], %[ftmp1] \n\t"
+
+ "pxor %[ftmp10], %[ftmp10], %[ff_pb_80] \n\t" /* flip bit 7 of q1, q0, p0, p1 ahead of the signed saturating ops */
+ "pxor %[ftmp9], %[ftmp9], %[ff_pb_80] \n\t"
+ "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t"
+ "pxor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t"
+
+ "psubsb %[ftmp2], %[ftmp5], %[ftmp10] \n\t" /* p1 - q1 */
+ "pand %[ftmp2], %[ftmp2], %[ftmp1] \n\t" /* ... kept only where hev is set */
+ "psubsb %[ftmp3], %[ftmp9], %[ftmp6] \n\t" /* q0 - p0 */
+ "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" /* add (q0 - p0) three times, saturating each time */
+ "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
+ "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
+ /* ftmp2:filter_value */
+ "pand %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
+
+ "paddsb %[ftmp11], %[ftmp2], %[ff_pb_04] \n\t" /* ftmp11 = filter_value + 4 */
+ "paddsb %[ftmp12], %[ftmp2], %[ff_pb_03] \n\t" /* ftmp12 = filter_value + 3 */
+
+ "dli %[tmp0], 0x0b \n\t" /* shift count 11; NOTE(review): presumably 8 + 3, i.e. a signed byte >> 3 after widening -- confirm */
+ "dmtc1 %[tmp0], %[ftmp7] \n\t"
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
+ "punpcklbh %[ftmp0], %[ftmp0], %[ftmp12] \n\t"
+ "punpckhbh %[ftmp8], %[ftmp8], %[ftmp12] \n\t"
+ "psrah %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
+ "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
+ "packsshb %[ftmp12], %[ftmp0], %[ftmp8] \n\t" /* ftmp12 = shifted (filter_value + 3), packed to bytes */
+
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
+ "punpcklbh %[ftmp0], %[ftmp0], %[ftmp11] \n\t"
+ "punpckhbh %[ftmp8], %[ftmp8], %[ftmp11] \n\t"
+ "psrah %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
+ "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
+ "packsshb %[ftmp11], %[ftmp0], %[ftmp8] \n\t" /* ftmp11 = shifted (filter_value + 4); ftmp0/ftmp8 keep the halfword form */
+
+ "psubsb %[ftmp9], %[ftmp9], %[ftmp11] \n\t" /* q0 -= ftmp11 */
+ "pxor %[ftmp9], %[ftmp9], %[ff_pb_80] \n\t" /* undo the bit-7 flip */
+ "paddsb %[ftmp6], %[ftmp6], %[ftmp12] \n\t" /* p0 += ftmp12 */
+ "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t"
+ "paddsh %[ftmp0], %[ftmp0], %[ff_ph_01] \n\t" /* halfword values + 1 (feeds the p1/q1 adjustment) */
+ "paddsh %[ftmp8], %[ftmp8], %[ff_ph_01] \n\t"
+
+ "dli %[tmp0], 0x01 \n\t"
+ "dmtc1 %[tmp0], %[ftmp7] \n\t"
+ "psrah %[ftmp0], %[ftmp0], %[ftmp7] \n\t" /* (value + 1) >> 1 on halfwords */
+ "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
+ "packsshb %[ftmp2], %[ftmp0], %[ftmp8] \n\t"
+ "pandn %[ftmp2], %[ftmp1], %[ftmp2] \n\t" /* ftmp2 = ~hev & ftmp2 */
+ "psubsb %[ftmp10], %[ftmp10], %[ftmp2] \n\t" /* q1 -= ftmp2 */
+ "pxor %[ftmp10], %[ftmp10], %[ff_pb_80] \n\t"
+ "paddsb %[ftmp5], %[ftmp5], %[ftmp2] \n\t" /* p1 += ftmp2 */
+ "pxor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t"
+
+ /* ftmp5: *op1 ; ftmp6: *op0 */
+ "punpcklbh %[ftmp2], %[ftmp5], %[ftmp6] \n\t"
+ "punpckhbh %[ftmp1], %[ftmp5], %[ftmp6] \n\t"
+ /* ftmp9: *oq0 ; ftmp10: *oq1 */
+ "punpcklbh %[ftmp4], %[ftmp9], %[ftmp10] \n\t"
+ "punpckhbh %[ftmp3], %[ftmp9], %[ftmp10] \n\t"
+ "punpckhhw %[ftmp6], %[ftmp2], %[ftmp4] \n\t" /* regroup the four outputs so each 32-bit word belongs to one row */
+ "punpcklhw %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
+ "punpckhhw %[ftmp5], %[ftmp1], %[ftmp3] \n\t"
+ "punpcklhw %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
+
+ MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
+ MMI_SUBU(%[addr1], %[src_ptr], %[tmp0])
+ "gsswlc1 %[ftmp2], 0x05(%[addr1]) \n\t" /* row src_ptr - 4; NOTE(review): presumably an unaligned 32-bit store to bytes 2..5 of the row -- confirm */
+ "gsswrc1 %[ftmp2], 0x02(%[addr1]) \n\t"
+
+ "li %[tmp0], 0x20 \n\t"
+ "mtc1 %[tmp0], %[ftmp9] \n\t" /* ftmp9 = shift count 32 */
+ "ssrld %[ftmp2], %[ftmp2], %[ftmp9] \n\t" /* bring the upper word down for the next row */
+ MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
+ MMI_SUBU(%[addr1], %[addr0], %[tmp0])
+ "gsswlc1 %[ftmp2], 0x05(%[addr1]) \n\t" /* row src_ptr - 3 */
+ "gsswrc1 %[ftmp2], 0x02(%[addr1]) \n\t"
+
+ MMI_SLL(%[tmp0], %[src_pixel_step], 0x01)
+ MMI_SUBU(%[addr1], %[src_ptr], %[tmp0])
+ "gsswlc1 %[ftmp6], 0x05(%[addr1]) \n\t" /* row src_ptr - 2 */
+ "gsswrc1 %[ftmp6], 0x02(%[addr1]) \n\t"
+
+ "ssrld %[ftmp6], %[ftmp6], %[ftmp9] \n\t"
+ MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
+ "gsswlc1 %[ftmp6], 0x05(%[addr1]) \n\t" /* row src_ptr - 1 */
+ "gsswrc1 %[ftmp6], 0x02(%[addr1]) \n\t"
+ "gsswlc1 %[ftmp1], 0x05(%[src_ptr]) \n\t" /* row src_ptr + 0 */
+ "gsswrc1 %[ftmp1], 0x02(%[src_ptr]) \n\t"
+
+ "ssrld %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
+ "gsswlc1 %[ftmp1], 0x05(%[addr0]) \n\t" /* row src_ptr + 1 */
+ "gsswrc1 %[ftmp1], 0x02(%[addr0]) \n\t"
+ MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step])
+ "gsswlc1 %[ftmp5], 0x05(%[addr1]) \n\t" /* row src_ptr + 2 */
+ "gsswrc1 %[ftmp5], 0x02(%[addr1]) \n\t"
+
+ "ssrld %[ftmp5], %[ftmp5], %[ftmp9] \n\t"
+ MMI_ADDU(%[addr1], %[addr0], %[tmp0]) /* tmp0 still holds 2 * src_pixel_step here */
+ "gsswlc1 %[ftmp5], 0x05(%[addr1]) \n\t" /* row src_ptr + 3 */
+ "gsswrc1 %[ftmp5], 0x02(%[addr1]) \n\t"
+
+ MMI_ADDIU(%[count], %[count], -0x01)
+ MMI_SLL(%[tmp0], %[src_pixel_step], 0x03)
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0]) /* advance 8 rows */
+ "bnez %[count], 1b \n\t" /* count is tested after the body (do-while) */
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
+ [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
+ [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]),
+ [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
+ [src_ptr]"+&r"(src_ptr), [count]"+&r"(count),
+ [ff_ph_01]"=&f"(ff_ph_01), [ff_pb_03]"=&f"(ff_pb_03),
+ [ff_pb_04]"=&f"(ff_pb_04), [ff_pb_80]"=&f"(ff_pb_80),
+ [ff_pb_fe]"=&f"(ff_pb_fe)
+ : [limit]"r"(limit), [blimit]"r"(blimit),
+ [thresh]"r"(thresh),
+ [src_pixel_step]"r"((mips_reg)src_pixel_step)
+ : "memory"
+ );
+ /* clang-format on */
+}
+/*
+ * VP8_MBLOOP_HPSRAB: asm fragment that widens the bytes of ftmp0 into the
+ * halfwords of ftmp10 (low four) and ftmp11 (high four), arithmetic-shifts
+ * every halfword right by the count held in ftmp9, and packs the result
+ * with signed saturation back into ftmp0. Clobbers ftmp10 and ftmp11.
+ * ftmp10/ftmp11 are not cleared beforehand; NOTE(review): presumably their
+ * stale bytes end up in the low half of each halfword and are discarded by
+ * a shift count of 8 or more -- confirm against the MMI manual.
+ */
+/* clang-format off */
+#define VP8_MBLOOP_HPSRAB \
+ "punpcklbh %[ftmp10], %[ftmp10], %[ftmp0] \n\t" \
+ "punpckhbh %[ftmp11], %[ftmp11], %[ftmp0] \n\t" \
+ "psrah %[ftmp10], %[ftmp10], %[ftmp9] \n\t" \
+ "psrah %[ftmp11], %[ftmp11], %[ftmp9] \n\t" \
+ "packsshb %[ftmp0], %[ftmp10], %[ftmp11] \n\t"
+/*
+ * VP8_MBLOOP_HPSRAB_ADD(reg): asm fragment that interleaves the bytes of
+ * ftmp0 and ftmp12 into halfwords (ftmp1 low four, ftmp2 high four),
+ * multiplies each halfword by the packed constant in `reg` with pmulhh,
+ * adds ff_ph_003f, arithmetic-shifts right by the count in ftmp9 and packs
+ * the result with signed saturation into ftmp1. Clobbers ftmp2.
+ * NOTE(review): presumably pmulhh keeps the high 16 bits of the product, so
+ * with ftmp0 zeroed and reg = (k << 8) per halfword the product is
+ * k * byte -- confirm against the MMI manual.
+ */
+#define VP8_MBLOOP_HPSRAB_ADD(reg) \
+ "punpcklbh %[ftmp1], %[ftmp0], %[ftmp12] \n\t" \
+ "punpckhbh %[ftmp2], %[ftmp0], %[ftmp12] \n\t" \
+ "pmulhh %[ftmp1], %[ftmp1], " #reg " \n\t" \
+ "pmulhh %[ftmp2], %[ftmp2], " #reg " \n\t" \
+ "paddh %[ftmp1], %[ftmp1], %[ff_ph_003f] \n\t" \
+ "paddh %[ftmp2], %[ftmp2], %[ff_ph_003f] \n\t" \
+ "psrah %[ftmp1], %[ftmp1], %[ftmp9] \n\t" \
+ "psrah %[ftmp2], %[ftmp2], %[ftmp9] \n\t" \
+ "packsshb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+/* clang-format on */
+/*
+ * Filters across a horizontal edge at src_ptr using Loongson MMI inline
+ * assembly, 8 adjacent byte columns per loop iteration. Unlike
+ * vp8_loop_filter_horizontal_edge_mmi this variant writes back six rows
+ * (p2, p1, p0, q0, q1, q2).
+ *
+ * src_ptr is first moved up 4 rows; each iteration then walks it down
+ * through the eight rows p3 .. q3 while loading, computes the mask (from
+ * limit and blimit) and hev (from thresh), applies a (+3 / +4) adjustment
+ * to p0/q0 from the hev part of the filter value and three further
+ * adjustments built with the constants 0x1b00, 0x1200 and 0x0900 from the
+ * non-hev part, and stores the results while walking src_ptr back to the
+ * p3 row. src_ptr then advances by 8 bytes.
+ *
+ * limit, blimit and thresh are each read as 8 bytes. count is tested only
+ * after the loop body has run, so it must be >= 1 on entry.
+ */
+void vp8_mbloop_filter_horizontal_edge_mmi(
+ unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit,
+ const unsigned char *limit, const unsigned char *thresh, int count) {
+ uint64_t tmp[1];
+ double ftmp[13];
+ double ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_03, ff_ph_003f, ff_ph_0900,
+ ff_ph_1200, ff_ph_1b00;
+
+ /* clang-format off */
+ __asm__ volatile (
+ "dli %[tmp0], 0xfefefefefefefefe \n\t" /* ff_pb_fe: 0xfe per byte */
+ "dmtc1 %[tmp0], %[ff_pb_fe] \n\t"
+ "dli %[tmp0], 0x8080808080808080 \n\t" /* ff_pb_80: 0x80 per byte */
+ "dmtc1 %[tmp0], %[ff_pb_80] \n\t"
+ "dli %[tmp0], 0x0404040404040404 \n\t" /* ff_pb_04: 0x04 per byte */
+ "dmtc1 %[tmp0], %[ff_pb_04] \n\t"
+ "dli %[tmp0], 0x0303030303030303 \n\t" /* ff_pb_03: 0x03 per byte */
+ "dmtc1 %[tmp0], %[ff_pb_03] \n\t"
+ "dli %[tmp0], 0x003f003f003f003f \n\t" /* ff_ph_003f: 63 per halfword */
+ "dmtc1 %[tmp0], %[ff_ph_003f] \n\t"
+ "dli %[tmp0], 0x0900090009000900 \n\t" /* ff_ph_0900: 9 << 8 per halfword */
+ "dmtc1 %[tmp0], %[ff_ph_0900] \n\t"
+ "dli %[tmp0], 0x1200120012001200 \n\t" /* ff_ph_1200: 18 << 8 per halfword */
+ "dmtc1 %[tmp0], %[ff_ph_1200] \n\t"
+ "dli %[tmp0], 0x1b001b001b001b00 \n\t" /* ff_ph_1b00: 27 << 8 per halfword */
+ "dmtc1 %[tmp0], %[ff_ph_1b00] \n\t"
+ MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
+ MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0]) /* src_ptr -> p3 row */
+ "1: \n\t" /* loop head: one pass per 8 columns */
+ "gsldlc1 %[ftmp9], 0x07(%[limit]) \n\t" /* ftmp9 = 8 bytes of limit */
+ "gsldrc1 %[ftmp9], 0x00(%[limit]) \n\t"
+ /* ftmp1: p3 */
+ "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
+ /* ftmp3: p2 */
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp3], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp3], 0x00(%[src_ptr]) \n\t"
+ /* ftmp4: p1 */
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp4], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp4], 0x00(%[src_ptr]) \n\t"
+ /* ftmp5: p0 */
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t"
+ /* ftmp6: q0 */
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t"
+ /* ftmp7: q1 */
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t"
+ /* ftmp8: q2 */
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t"
+ /* ftmp2: q3 */
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp2], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp2], 0x00(%[src_ptr]) \n\t"
+
+ "gsldlc1 %[ftmp12], 0x07(%[blimit]) \n\t" /* ftmp12 = 8 bytes of blimit */
+ "gsldrc1 %[ftmp12], 0x00(%[blimit]) \n\t"
+
+ "pasubub %[ftmp0], %[ftmp1], %[ftmp3] \n\t" /* abs(p3 - p2) */
+ "psubusb %[ftmp0], %[ftmp0], %[ftmp9] \n\t" /* nonzero only where it exceeds limit */
+ "pasubub %[ftmp1], %[ftmp3], %[ftmp4] \n\t" /* abs(p2 - p1) */
+ "psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
+ "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" /* ftmp0 accumulates every limit violation */
+ "pasubub %[ftmp10], %[ftmp4], %[ftmp5] \n\t" /* ftmp10 = abs(p1 - p0), reused for the thresh test */
+ "psubusb %[ftmp1], %[ftmp10], %[ftmp9] \n\t"
+ "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
+ "pasubub %[ftmp11], %[ftmp7], %[ftmp6] \n\t" /* ftmp11 = abs(q1 - q0), reused for the thresh test */
+ "psubusb %[ftmp1], %[ftmp11], %[ftmp9] \n\t"
+ "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
+ "pasubub %[ftmp1], %[ftmp8], %[ftmp7] \n\t" /* abs(q2 - q1) */
+ "psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
+ "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
+ "pasubub %[ftmp1], %[ftmp2], %[ftmp8] \n\t" /* abs(q3 - q2) */
+ "psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
+ "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
+
+ "pasubub %[ftmp1], %[ftmp5], %[ftmp6] \n\t" /* abs(p0 - q0) */
+ "paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" /* ... doubled, saturating */
+ "pasubub %[ftmp2], %[ftmp4], %[ftmp7] \n\t" /* abs(p1 - q1) */
+ "pand %[ftmp2], %[ftmp2], %[ff_pb_fe] \n\t" /* clear bit 0 so the halfword shift cannot leak between bytes */
+ "dli %[tmp0], 0x01 \n\t"
+ "dmtc1 %[tmp0], %[ftmp9] \n\t" /* ftmp9 now holds the shift count 1 */
+ "psrlh %[ftmp2], %[ftmp2], %[ftmp9] \n\t" /* abs(p1 - q1) / 2 */
+ "paddusb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ "psubusb %[ftmp1], %[ftmp1], %[ftmp12] \n\t" /* nonzero only where the sum exceeds blimit */
+ "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
+ "pxor %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
+ /* ftmp0: mask */
+ "pcmpeqb %[ftmp0], %[ftmp0], %[ftmp9] \n\t"
+
+ "gsldlc1 %[ftmp9], 0x07(%[thresh]) \n\t" /* ftmp9 = 8 bytes of thresh */
+ "gsldrc1 %[ftmp9], 0x00(%[thresh]) \n\t"
+ "psubusb %[ftmp1], %[ftmp10], %[ftmp9] \n\t" /* excess of abs(p1 - p0) over thresh */
+ "psubusb %[ftmp2], %[ftmp11], %[ftmp9] \n\t" /* excess of abs(q1 - q0) over thresh */
+ "paddb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ "pxor %[ftmp2], %[ftmp2], %[ftmp2] \n\t"
+ "pcmpeqb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" /* 0xff where the summed excess is zero */
+ "pcmpeqb %[ftmp2], %[ftmp2], %[ftmp2] \n\t" /* ftmp2 = all ones */
+ /* ftmp1: hev */
+ "pxor %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+
+ "pxor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t" /* flip bit 7 of p1, p0, q0, q1 ahead of the signed saturating ops */
+ "pxor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t"
+ "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t"
+ "pxor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t"
+ "psubsb %[ftmp2], %[ftmp4], %[ftmp7] \n\t" /* p1 - q1 */
+ "psubsb %[ftmp9], %[ftmp6], %[ftmp5] \n\t" /* q0 - p0 */
+ "paddsb %[ftmp2], %[ftmp2], %[ftmp9] \n\t" /* add (q0 - p0) three times, saturating each time */
+ "paddsb %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
+ "paddsb %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
+ "pand %[ftmp2], %[ftmp2], %[ftmp0] \n\t" /* filter value, zeroed where mask is clear */
+ "pandn %[ftmp12], %[ftmp1], %[ftmp2] \n\t" /* ftmp12 = filter value & ~hev */
+ "pand %[ftmp2], %[ftmp2], %[ftmp1] \n\t" /* ftmp2 = filter value & hev */
+
+ "dli %[tmp0], 0x0b \n\t" /* shift count 11 for VP8_MBLOOP_HPSRAB */
+ "dmtc1 %[tmp0], %[ftmp9] \n\t"
+ "paddsb %[ftmp0], %[ftmp2], %[ff_pb_03] \n\t" /* ftmp0 = (filter & hev) + 3 */
+ VP8_MBLOOP_HPSRAB
+ "paddsb %[ftmp5], %[ftmp5], %[ftmp0] \n\t" /* p0 += shifted value */
+ "paddsb %[ftmp0], %[ftmp2], %[ff_pb_04] \n\t" /* ftmp0 = (filter & hev) + 4 */
+ VP8_MBLOOP_HPSRAB
+ "psubsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t" /* q0 -= shifted value */
+
+ "dli %[tmp0], 0x07 \n\t" /* shift count 7 for VP8_MBLOOP_HPSRAB_ADD */
+ "dmtc1 %[tmp0], %[ftmp9] \n\t"
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" /* zero ftmp0, the first unpack operand in the macro */
+
+ VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_1b00]) /* ftmp1 = adjustment for p0/q0 from ftmp12 */
+ "psubsb %[ftmp6], %[ftmp6], %[ftmp1] \n\t" /* q0 -= ftmp1 */
+ "paddsb %[ftmp5], %[ftmp5], %[ftmp1] \n\t" /* p0 += ftmp1 */
+ "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" /* undo the bit-7 flip before storing */
+ "pxor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t"
+ MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
+ MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0]) /* q3 row - 4 rows = p0 row */
+ "gssdlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t" /* store p0 */
+ "gssdrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t"
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gssdlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" /* store q0 */
+ "gssdrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t"
+
+ VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_1200]) /* ftmp1 = adjustment for p1/q1 from ftmp12 */
+ "paddsb %[ftmp4], %[ftmp4], %[ftmp1] \n\t" /* p1 += ftmp1 */
+ "psubsb %[ftmp7], %[ftmp7], %[ftmp1] \n\t" /* q1 -= ftmp1 */
+ "pxor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t"
+ "pxor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t"
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gssdlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t" /* store q1 */
+ "gssdrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t"
+ MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0]) /* tmp0 is still 4 * src_pixel_step */
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) /* net -3 rows: q1 row -> p1 row */
+ "gssdlc1 %[ftmp4], 0x07(%[src_ptr]) \n\t" /* store p1 */
+ "gssdrc1 %[ftmp4], 0x00(%[src_ptr]) \n\t"
+
+ VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_0900]) /* ftmp1 = adjustment for p2/q2 from ftmp12 */
+ "pxor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" /* p2 and q2 get their bit-7 flip only here */
+ "pxor %[ftmp8], %[ftmp8], %[ff_pb_80] \n\t"
+ "paddsb %[ftmp3], %[ftmp3], %[ftmp1] \n\t" /* p2 += ftmp1 */
+ "psubsb %[ftmp8], %[ftmp8], %[ftmp1] \n\t" /* q2 -= ftmp1 */
+ "pxor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t"
+ "pxor %[ftmp8], %[ftmp8], %[ff_pb_80] \n\t"
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0]) /* p1 row + 4 rows = q2 row */
+ "gssdlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t" /* store q2 */
+ "gssdrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t"
+ MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0])
+ MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) /* net -5 rows: q2 row -> p2 row */
+ "gssdlc1 %[ftmp3], 0x07(%[src_ptr]) \n\t" /* store p2 */
+ "gssdrc1 %[ftmp3], 0x00(%[src_ptr]) \n\t"
+
+ MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) /* back to the p3 row */
+ MMI_ADDIU(%[src_ptr], %[src_ptr], 0x08) /* step to the next 8 columns */
+ "addiu %[count], %[count], -0x01 \n\t"
+ "bnez %[count], 1b \n\t" /* count is tested after the body (do-while) */
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
+ [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
+ [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]),
+ [src_ptr]"+&r"(src_ptr), [count]"+&r"(count),
+ [ff_pb_fe]"=&f"(ff_pb_fe), [ff_pb_80]"=&f"(ff_pb_80),
+ [ff_pb_04]"=&f"(ff_pb_04), [ff_pb_03]"=&f"(ff_pb_03),
+ [ff_ph_0900]"=&f"(ff_ph_0900), [ff_ph_1b00]"=&f"(ff_ph_1b00),
+ [ff_ph_1200]"=&f"(ff_ph_1200), [ff_ph_003f]"=&f"(ff_ph_003f)
+ : [limit]"r"(limit), [blimit]"r"(blimit),
+ [thresh]"r"(thresh),
+ [src_pixel_step]"r"((mips_reg)src_pixel_step)
+ : "memory"
+ );
+ /* clang-format on */
+}
+/*
+ * VP8_MBLOOP_VPSRAB_ADDH: asm fragment that zeroes ftmp7 and ftmp8 and then
+ * widens the bytes of ftmp0 into their halfwords (ftmp7 from the low four
+ * bytes, ftmp8 from the high four).
+ */
+/* clang-format off */
+#define VP8_MBLOOP_VPSRAB_ADDH \
+ "pxor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" \
+ "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" \
+ "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" \
+ "punpckhbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t"
+/*
+ * VP8_MBLOOP_VPSRAB_ADDT: asm fragment that adds ff_ph_003f to the
+ * halfwords in ftmp7 and ftmp8, arithmetic-shifts them right by the count
+ * held in ftmp12 and packs both with signed saturation into ftmp3.
+ */
+#define VP8_MBLOOP_VPSRAB_ADDT \
+ "paddh %[ftmp7], %[ftmp7], %[ff_ph_003f] \n\t" \
+ "paddh %[ftmp8], %[ftmp8], %[ff_ph_003f] \n\t" \
+ "psrah %[ftmp7], %[ftmp7], %[ftmp12] \n\t" \
+ "psrah %[ftmp8], %[ftmp8], %[ftmp12] \n\t" \
+ "packsshb %[ftmp3], %[ftmp7], %[ftmp8] \n\t"
+/* clang-format on */
+
+void vp8_mbloop_filter_vertical_edge_mmi(
+ unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit,
+ const unsigned char *limit, const unsigned char *thresh, int count) {
+ mips_reg tmp[1];
+ DECLARE_ALIGNED(8, const uint64_t, srct[2]);
+ double ftmp[14];
+ double ff_ph_003f, ff_ph_0900, ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_03;
+
+ /* clang-format off */
+ __asm__ volatile (
+ "dli %[tmp0], 0x003f003f003f003f \n\t"
+ "dmtc1 %[tmp0], %[ff_ph_003f] \n\t"
+ "dli %[tmp0], 0x0900090009000900 \n\t"
+ "dmtc1 %[tmp0], %[ff_ph_0900] \n\t"
+ "dli %[tmp0], 0xfefefefefefefefe \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_fe] \n\t"
+ "dli %[tmp0], 0x8080808080808080 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_80] \n\t"
+ "dli %[tmp0], 0x0404040404040404 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_04] \n\t"
+ "dli %[tmp0], 0x0303030303030303 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_03] \n\t"
+ MMI_SUBU(%[src_ptr], %[src_ptr], 0x04)
+
+ "1: \n\t"
+ "gsldlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t"
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t"
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t"
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t"
+
+ "punpcklbh %[ftmp11], %[ftmp5], %[ftmp6] \n\t"
+ "punpckhbh %[ftmp12], %[ftmp5], %[ftmp6] \n\t"
+ "punpcklbh %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
+ "punpckhbh %[ftmp10], %[ftmp7], %[ftmp8] \n\t"
+
+ "punpcklhw %[ftmp1], %[ftmp12], %[ftmp10] \n\t"
+ "punpckhhw %[ftmp2], %[ftmp12], %[ftmp10] \n\t"
+ "punpcklhw %[ftmp3], %[ftmp11], %[ftmp9] \n\t"
+ "punpckhhw %[ftmp4], %[ftmp11], %[ftmp9] \n\t"
+
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t"
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t"
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t"
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t"
+
+ "punpcklbh %[ftmp11], %[ftmp5], %[ftmp6] \n\t"
+ "punpckhbh %[ftmp12], %[ftmp5], %[ftmp6] \n\t"
+ "punpcklbh %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
+ "punpckhbh %[ftmp10], %[ftmp7], %[ftmp8] \n\t"
+
+ "punpcklhw %[ftmp5], %[ftmp12], %[ftmp10] \n\t"
+ "punpckhhw %[ftmp6], %[ftmp12], %[ftmp10] \n\t"
+ "punpcklhw %[ftmp7], %[ftmp11], %[ftmp9] \n\t"
+ "punpckhhw %[ftmp8], %[ftmp11], %[ftmp9] \n\t"
+
+ "gsldlc1 %[ftmp13], 0x07(%[limit]) \n\t"
+ "gsldrc1 %[ftmp13], 0x00(%[limit]) \n\t"
+ /* ftmp9:q0 ftmp10:q1 */
+ "punpcklwd %[ftmp9], %[ftmp1], %[ftmp5] \n\t"
+ "punpckhwd %[ftmp10], %[ftmp1], %[ftmp5] \n\t"
+ /* ftmp11:q2 ftmp12:q3 */
+ "punpcklwd %[ftmp11], %[ftmp2], %[ftmp6] \n\t"
+ "punpckhwd %[ftmp12], %[ftmp2], %[ftmp6] \n\t"
+ /* srct[0x00]: q3 */
+ "sdc1 %[ftmp12], 0x00(%[srct]) \n\t"
+ /* ftmp1:p3 ftmp2:p2 */
+ "punpcklwd %[ftmp1], %[ftmp3], %[ftmp7] \n\t"
+ "punpckhwd %[ftmp2], %[ftmp3], %[ftmp7] \n\t"
+ /* srct[0x08]: p3 */
+ "sdc1 %[ftmp1], 0x08(%[srct]) \n\t"
+ /* ftmp5:p1 ftmp6:p0 */
+ "punpcklwd %[ftmp5], %[ftmp4], %[ftmp8] \n\t"
+ "punpckhwd %[ftmp6], %[ftmp4], %[ftmp8] \n\t"
+
+ /* abs (q3-q2) */
+ "pasubub %[ftmp7], %[ftmp12], %[ftmp11] \n\t"
+ "psubusb %[ftmp0], %[ftmp7], %[ftmp13] \n\t"
+ /* abs (q2-q1) */
+ "pasubub %[ftmp7], %[ftmp11], %[ftmp10] \n\t"
+ "psubusb %[ftmp7], %[ftmp7], %[ftmp13] \n\t"
+ "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
+ /* ftmp3: abs(q1-q0) */
+ "pasubub %[ftmp3], %[ftmp10], %[ftmp9] \n\t"
+ "psubusb %[ftmp7], %[ftmp3], %[ftmp13] \n\t"
+ "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
+ /* ftmp4: abs(p1-p0) */
+ "pasubub %[ftmp4], %[ftmp5], %[ftmp6] \n\t"
+ "psubusb %[ftmp7], %[ftmp4], %[ftmp13] \n\t"
+ "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
+ /* abs (p2-p1) */
+ "pasubub %[ftmp7], %[ftmp2], %[ftmp5] \n\t"
+ "psubusb %[ftmp7], %[ftmp7], %[ftmp13] \n\t"
+ "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
+ /* abs (p3-p2) */
+ "pasubub %[ftmp7], %[ftmp1], %[ftmp2] \n\t"
+ "psubusb %[ftmp7], %[ftmp7], %[ftmp13] \n\t"
+ "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
+
+ "gsldlc1 %[ftmp13], 0x07(%[blimit]) \n\t"
+ "gsldrc1 %[ftmp13], 0x00(%[blimit]) \n\t"
+ "gsldlc1 %[ftmp7], 0x07(%[thresh]) \n\t"
+ "gsldrc1 %[ftmp7], 0x00(%[thresh]) \n\t"
+ /* abs (p0-q0) * 2 */
+ "pasubub %[ftmp1], %[ftmp9], %[ftmp6] \n\t"
+ "paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t"
+ /* abs (p1-q1) / 2 */
+ "pasubub %[ftmp12], %[ftmp10], %[ftmp5] \n\t"
+ "pand %[ftmp12], %[ftmp12], %[ff_pb_fe] \n\t"
+ "dli %[tmp0], 0x01 \n\t"
+ "dmtc1 %[tmp0], %[ftmp8] \n\t"
+ "psrlh %[ftmp12], %[ftmp12], %[ftmp8] \n\t"
+ "paddusb %[ftmp12], %[ftmp1], %[ftmp12] \n\t"
+ "psubusb %[ftmp12], %[ftmp12], %[ftmp13] \n\t"
+ "por %[ftmp0], %[ftmp0], %[ftmp12] \n\t"
+ "pxor %[ftmp12], %[ftmp12], %[ftmp12] \n\t"
+ /* ftmp0: mask */
+ "pcmpeqb %[ftmp0], %[ftmp0], %[ftmp12] \n\t"
+
+ /* abs(p1-p0) - thresh */
+ "psubusb %[ftmp4], %[ftmp4], %[ftmp7] \n\t"
+ /* abs(q1-q0) - thresh */
+ "psubusb %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
+ "por %[ftmp3], %[ftmp4], %[ftmp3] \n\t"
+ "pcmpeqb %[ftmp3], %[ftmp3], %[ftmp12] \n\t"
+ "pcmpeqb %[ftmp1], %[ftmp1], %[ftmp1] \n\t"
+ /* ftmp1: hev */
+ "pxor %[ftmp1], %[ftmp3], %[ftmp1] \n\t"
+
+ /* ftmp2:ps2, ftmp5:ps1, ftmp6:ps0, ftmp9:qs0, ftmp10:qs1, ftmp11:qs2 */
+ "pxor %[ftmp11], %[ftmp11], %[ff_pb_80] \n\t"
+ "pxor %[ftmp10], %[ftmp10], %[ff_pb_80] \n\t"
+ "pxor %[ftmp9], %[ftmp9], %[ff_pb_80] \n\t"
+ "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t"
+ "pxor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t"
+ "pxor %[ftmp2], %[ftmp2], %[ff_pb_80] \n\t"
+
+ "psubsb %[ftmp3], %[ftmp5], %[ftmp10] \n\t"
+ "psubsb %[ftmp4], %[ftmp9], %[ftmp6] \n\t"
+ "paddsb %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
+ "paddsb %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
+ "paddsb %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
+ /* filter_value &= mask */
+ "pand %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
+ /* Filter2 = filter_value & hev */
+ "pand %[ftmp3], %[ftmp1], %[ftmp0] \n\t"
+ /* filter_value &= ~hev */
+ "pandn %[ftmp0], %[ftmp1], %[ftmp0] \n\t"
+
+ "paddsb %[ftmp4], %[ftmp3], %[ff_pb_04] \n\t"
+ "dli %[tmp0], 0x0b \n\t"
+ "dmtc1 %[tmp0], %[ftmp12] \n\t"
+ "punpcklbh %[ftmp7], %[ftmp7], %[ftmp4] \n\t"
+ "punpckhbh %[ftmp8], %[ftmp8], %[ftmp4] \n\t"
+ "psrah %[ftmp7], %[ftmp7], %[ftmp12] \n\t"
+ "psrah %[ftmp8], %[ftmp8], %[ftmp12] \n\t"
+ "packsshb %[ftmp4], %[ftmp7], %[ftmp8] \n\t"
+ /* ftmp9: qs0 */
+ "psubsb %[ftmp9], %[ftmp9], %[ftmp4] \n\t"
+ "paddsb %[ftmp3], %[ftmp3], %[ff_pb_03] \n\t"
+ "punpcklbh %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
+ "punpckhbh %[ftmp8], %[ftmp8], %[ftmp3] \n\t"
+ "psrah %[ftmp7], %[ftmp7], %[ftmp12] \n\t"
+ "psrah %[ftmp8], %[ftmp8], %[ftmp12] \n\t"
+ "packsshb %[ftmp3], %[ftmp7], %[ftmp8] \n\t"
+ /* ftmp6: ps0 */
+ "paddsb %[ftmp6], %[ftmp6], %[ftmp3] \n\t"
+
+ "dli %[tmp0], 0x07 \n\t"
+ "dmtc1 %[tmp0], %[ftmp12] \n\t"
+ VP8_MBLOOP_VPSRAB_ADDH
+ "paddh %[ftmp1], %[ff_ph_0900], %[ff_ph_0900] \n\t"
+ "paddh %[ftmp1], %[ftmp1], %[ff_ph_0900] \n\t"
+ "pmulhh %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
+ "pmulhh %[ftmp8], %[ftmp8], %[ftmp1] \n\t"
+ VP8_MBLOOP_VPSRAB_ADDT
+ "psubsb %[ftmp4], %[ftmp9], %[ftmp3] \n\t"
+ /* ftmp9: oq0 */
+ "pxor %[ftmp9], %[ftmp4], %[ff_pb_80] \n\t"
+ "paddsb %[ftmp4], %[ftmp6], %[ftmp3] \n\t"
+ /* ftmp6: op0 */
+ "pxor %[ftmp6], %[ftmp4], %[ff_pb_80] \n\t"
+
+ VP8_MBLOOP_VPSRAB_ADDH
+ "paddh %[ftmp1], %[ff_ph_0900], %[ff_ph_0900] \n\t"
+ "pmulhh %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
+ "pmulhh %[ftmp8], %[ftmp8], %[ftmp1] \n\t"
+ VP8_MBLOOP_VPSRAB_ADDT
+ "psubsb %[ftmp4], %[ftmp10], %[ftmp3] \n\t"
+ /* ftmp10: oq1 */
+ "pxor %[ftmp10], %[ftmp4], %[ff_pb_80] \n\t"
+ "paddsb %[ftmp4], %[ftmp5], %[ftmp3] \n\t"
+ /* ftmp5: op1 */
+ "pxor %[ftmp5], %[ftmp4], %[ff_pb_80] \n\t"
+
+ VP8_MBLOOP_VPSRAB_ADDH
+ "pmulhh %[ftmp7], %[ftmp7], %[ff_ph_0900] \n\t"
+ "pmulhh %[ftmp8], %[ftmp8], %[ff_ph_0900] \n\t"
+ VP8_MBLOOP_VPSRAB_ADDT
+ "psubsb %[ftmp4], %[ftmp11], %[ftmp3] \n\t"
+ /* ftmp11: oq2 */
+ "pxor %[ftmp11], %[ftmp4], %[ff_pb_80] \n\t"
+ "paddsb %[ftmp4], %[ftmp2], %[ftmp3] \n\t"
+ /* ftmp2: op2 */
+ "pxor %[ftmp2], %[ftmp4], %[ff_pb_80] \n\t"
+
+ "ldc1 %[ftmp12], 0x00(%[srct]) \n\t"
+ "ldc1 %[ftmp8], 0x08(%[srct]) \n\t"
+
+ "punpcklbh %[ftmp0], %[ftmp8], %[ftmp2] \n\t"
+ "punpckhbh %[ftmp1], %[ftmp8], %[ftmp2] \n\t"
+ "punpcklbh %[ftmp2], %[ftmp5], %[ftmp6] \n\t"
+ "punpckhbh %[ftmp3], %[ftmp5], %[ftmp6] \n\t"
+ "punpcklhw %[ftmp4], %[ftmp0], %[ftmp2] \n\t"
+ "punpckhhw %[ftmp5], %[ftmp0], %[ftmp2] \n\t"
+ "punpcklhw %[ftmp6], %[ftmp1], %[ftmp3] \n\t"
+ "punpckhhw %[ftmp7], %[ftmp1], %[ftmp3] \n\t"
+
+ "punpcklbh %[ftmp0], %[ftmp9], %[ftmp10] \n\t"
+ "punpckhbh %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
+ "punpcklbh %[ftmp2], %[ftmp11], %[ftmp12] \n\t"
+ "punpckhbh %[ftmp3], %[ftmp11], %[ftmp12] \n\t"
+ "punpcklhw %[ftmp8], %[ftmp0], %[ftmp2] \n\t"
+ "punpckhhw %[ftmp9], %[ftmp0], %[ftmp2] \n\t"
+ "punpcklhw %[ftmp10], %[ftmp1], %[ftmp3] \n\t"
+ "punpckhhw %[ftmp11], %[ftmp1], %[ftmp3] \n\t"
+
+ "punpcklwd %[ftmp0], %[ftmp7], %[ftmp11] \n\t"
+ "punpckhwd %[ftmp1], %[ftmp7], %[ftmp11] \n\t"
+ "gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
+ "gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
+ MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t"
+ "gssdrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t"
+
+ "punpcklwd %[ftmp0], %[ftmp6], %[ftmp10] \n\t"
+ "punpckhwd %[ftmp1], %[ftmp6], %[ftmp10] \n\t"
+ MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
+ "gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
+ MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t"
+ "gssdrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t"
+
+ "punpcklwd %[ftmp1], %[ftmp5], %[ftmp9] \n\t"
+ "punpckhwd %[ftmp0], %[ftmp5], %[ftmp9] \n\t"
+ MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t"
+ "gssdrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t"
+ MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
+ "gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
+
+ "punpcklwd %[ftmp1], %[ftmp4], %[ftmp8] \n\t"
+ "punpckhwd %[ftmp0], %[ftmp4], %[ftmp8] \n\t"
+ MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t"
+ "gssdrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t"
+ MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
+ "gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
+ "addiu %[count], %[count], -0x01 \n\t"
+
+ MMI_SLL(%[tmp0], %[src_pixel_step], 0x03)
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0])
+ "bnez %[count], 1b \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
+ [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
+ [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),
+ [tmp0]"=&r"(tmp[0]), [src_ptr]"+&r"(src_ptr),
+ [count]"+&r"(count),
+ [ff_ph_003f]"=&f"(ff_ph_003f), [ff_ph_0900]"=&f"(ff_ph_0900),
+ [ff_pb_03]"=&f"(ff_pb_03), [ff_pb_04]"=&f"(ff_pb_04),
+ [ff_pb_80]"=&f"(ff_pb_80), [ff_pb_fe]"=&f"(ff_pb_fe)
+ : [limit]"r"(limit), [blimit]"r"(blimit),
+ [srct]"r"(srct), [thresh]"r"(thresh),
+ [src_pixel_step]"r"((mips_reg)src_pixel_step)
+ : "memory"
+ );
+ /* clang-format on */
+}
+
+/* clang-format off */
+/*
+ * Inline-asm fragment: combines halfword shifts of %[ftmp5] into %[ftmp0],
+ * using %[ftmp1] as scratch. %[ftmp5] itself is left unchanged.
+ *
+ * The shift counts come from registers the caller must have loaded:
+ * %[ftmp8], %[ftmp9] and %[ftmp10]. With the values that
+ * vp8_loop_filter_simple_horizontal_edge_mmi loads (8, 3 and 11) the two
+ * halves work out as follows for every 16-bit lane:
+ *   - low byte:  (lane << 8) >>arith 3 >>logical 8  ==  low byte >> 3 (signed)
+ *   - high byte: (lane >>arith 11) << 8             ==  high byte >> 3 (signed)
+ * so the OR of the two is a per-byte arithmetic right shift by 3.
+ */
+#define VP8_SIMPLE_HPSRAB \
+ "psllh %[ftmp0], %[ftmp5], %[ftmp8] \n\t" \
+ "psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t" \
+ "psrlh %[ftmp0], %[ftmp0], %[ftmp8] \n\t" \
+ "psrah %[ftmp1], %[ftmp5], %[ftmp10] \n\t" \
+ "psllh %[ftmp1], %[ftmp1], %[ftmp8] \n\t" \
+ "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
+/* clang-format on */
+
+/*
+ * Simple loop filter across a horizontal edge.
+ *
+ * src_ptr:        first pixel of the row directly below the edge (q0 row).
+ * src_pixel_step: distance in bytes between successive rows.
+ * blimit:         8 bytes are read from here and used as the per-byte edge
+ *                 limit.
+ *
+ * The rows at -2, -1, 0 and +1 times src_pixel_step are read as p1, p0, q0
+ * and q1. Each loop iteration handles 8 adjacent pixels and the loop runs
+ * twice (count = 2), so 16 pixels are processed. Only the p0 and q0 rows
+ * are written back.
+ */
+void vp8_loop_filter_simple_horizontal_edge_mmi(unsigned char *src_ptr,
+ int src_pixel_step,
+ const unsigned char *blimit) {
+ uint64_t tmp[1], count = 2;
+ mips_reg addr[2];
+ double ftmp[12];
+ double ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_01;
+
+ /* clang-format off */
+ __asm__ volatile (
+ /* Shift counts: ftmp8 = 8, ftmp9 = 3, ftmp10 = 11 (consumed by
+ * VP8_SIMPLE_HPSRAB) and ftmp11 = 1. Each is loaded exactly once. */
+ "dli %[tmp0], 0x0b \n\t"
+ "dmtc1 %[tmp0], %[ftmp10] \n\t"
+ "dli %[tmp0], 0x01 \n\t"
+ "dmtc1 %[tmp0], %[ftmp11] \n\t"
+ "dli %[tmp0], 0x08 \n\t"
+ "dmtc1 %[tmp0], %[ftmp8] \n\t"
+ "dli %[tmp0], 0x03 \n\t"
+ "dmtc1 %[tmp0], %[ftmp9] \n\t"
+ /* Byte-replicated constants 0xfe, 0x80, 0x04 and 0x01. */
+ "dli %[tmp0], 0xfefefefefefefefe \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_fe] \n\t"
+ "dli %[tmp0], 0x8080808080808080 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_80] \n\t"
+ "dli %[tmp0], 0x0404040404040404 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_04] \n\t"
+ "dli %[tmp0], 0x0101010101010101 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_01] \n\t"
+
+ "1: \n\t"
+ /* ftmp3: blimit */
+ "gsldlc1 %[ftmp3], 0x07(%[blimit]) \n\t"
+ "gsldrc1 %[ftmp3], 0x00(%[blimit]) \n\t"
+
+ MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step])
+
+ /* ftmp2: p1 (src - 2 * step), ftmp7: q1 (src + step) */
+ MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2])
+ "gsldlc1 %[ftmp2], 0x07(%[addr1]) \n\t"
+ "gsldrc1 %[ftmp2], 0x00(%[addr1]) \n\t"
+ "gsldlc1 %[ftmp7], 0x07(%[addr0]) \n\t"
+ "gsldrc1 %[ftmp7], 0x00(%[addr0]) \n\t"
+ /* ftmp1: abs(p1 - q1) / 2; the 0xfe mask clears each byte's low bit so
+ * the halfword shift cannot leak a bit into the neighbouring byte. */
+ "pasubub %[ftmp1], %[ftmp7], %[ftmp2] \n\t"
+ "pand %[ftmp1], %[ftmp1], %[ff_pb_fe] \n\t"
+ "psrlh %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
+
+ /* ftmp6: p0 (src - step), ftmp0: q0 (src) */
+ MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp6], 0x07(%[addr1]) \n\t"
+ "gsldrc1 %[ftmp6], 0x00(%[addr1]) \n\t"
+ "gsldlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t"
+ /* ftmp5: mask, 0xff where abs(p0 - q0) * 2 + abs(p1 - q1) / 2 does not
+ * exceed blimit, 0x00 elsewhere. */
+ "pasubub %[ftmp5], %[ftmp6], %[ftmp0] \n\t"
+ "paddusb %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
+ "paddusb %[ftmp5], %[ftmp5], %[ftmp1] \n\t"
+ "psubusb %[ftmp5], %[ftmp5], %[ftmp3] \n\t"
+ "pxor %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
+ "pcmpeqb %[ftmp5], %[ftmp5], %[ftmp3] \n\t"
+
+ /* Flip the top bit of every byte (xor 0x80) so the saturating signed
+ * byte operations can be used. ftmp2: (p1 - q1), ftmp6: p0, ftmp3: q0. */
+ "pxor %[ftmp2], %[ftmp2], %[ff_pb_80] \n\t"
+ "pxor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t"
+ "psubsb %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
+ "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t"
+ "pxor %[ftmp3], %[ftmp0], %[ff_pb_80] \n\t"
+ /* ftmp2 = (p1 - q1) + 3 * (q0 - p0), then masked into ftmp5 */
+ "psubsb %[ftmp0], %[ftmp3], %[ftmp6] \n\t"
+ "paddsb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
+ "paddsb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
+ "paddsb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
+ "pand %[ftmp5], %[ftmp5], %[ftmp2] \n\t"
+
+ /* q0 -= (filter + 4) >> 3, written back to the q0 row */
+ "paddsb %[ftmp5], %[ftmp5], %[ff_pb_04] \n\t"
+ VP8_SIMPLE_HPSRAB
+ "psubsb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
+ "pxor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t"
+ "gssdlc1 %[ftmp3], 0x07(%[src_ptr]) \n\t"
+ "gssdrc1 %[ftmp3], 0x00(%[src_ptr]) \n\t"
+
+ /* p0 += ((filter + 4) - 1) >> 3, written back to the p0 row */
+ "psubsb %[ftmp5], %[ftmp5], %[ff_pb_01] \n\t"
+ VP8_SIMPLE_HPSRAB
+ "paddsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
+ "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t"
+ MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
+ "gssdlc1 %[ftmp6], 0x07(%[addr1]) \n\t"
+ "gssdrc1 %[ftmp6], 0x00(%[addr1]) \n\t"
+
+ /* Next group of 8 pixels along the edge. */
+ "addiu %[count], %[count], -0x01 \n\t"
+ MMI_ADDIU(%[src_ptr], %[src_ptr], 0x08)
+ "bnez %[count], 1b \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
+ [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
+ [tmp0]"=&r"(tmp[0]),
+ [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
+ [src_ptr]"+&r"(src_ptr), [count]"+&r"(count),
+ [ff_pb_fe]"=&f"(ff_pb_fe), [ff_pb_80]"=&f"(ff_pb_80),
+ [ff_pb_04]"=&f"(ff_pb_04), [ff_pb_01]"=&f"(ff_pb_01)
+ : [blimit]"r"(blimit),
+ [src_pixel_step]"r"((mips_reg)src_pixel_step),
+ [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1))
+ : "memory"
+ );
+ /* clang-format on */
+}
+
+/*
+ * Simple loop filter across a vertical edge.
+ *
+ * src_ptr:        pixel immediately right of the edge (q0) in the first row.
+ * src_pixel_step: distance in bytes between successive rows.
+ * blimit:         8 bytes are read from here and used as the per-byte edge
+ *                 limit.
+ *
+ * Four bytes (columns -2..+1 around the edge: p1, p0, q0, q1) are read from
+ * each of 8 rows and transposed so that every register holds one column.
+ * After filtering, the columns are transposed back and 4 bytes per row are
+ * stored. The loop runs twice (count = 2), so 16 rows are processed.
+ *
+ * srct is scratch memory that the asm writes (sdc1) and reads back (ldc1),
+ * so it must not be const-qualified.
+ */
+void vp8_loop_filter_simple_vertical_edge_mmi(unsigned char *src_ptr,
+ int src_pixel_step,
+ const unsigned char *blimit) {
+ uint64_t tmp[1], count = 2;
+ mips_reg addr[2];
+ DECLARE_ALIGNED(8, uint64_t, srct[2]);
+ double ftmp[12], ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_01;
+
+ /* clang-format off */
+ __asm__ volatile (
+ /* Shift counts: ftmp8 = 8, ftmp10 = 32. Each is loaded exactly once. */
+ "dli %[tmp0], 0x08 \n\t"
+ "dmtc1 %[tmp0], %[ftmp8] \n\t"
+ "dli %[tmp0], 0x20 \n\t"
+ "dmtc1 %[tmp0], %[ftmp10] \n\t"
+ /* Byte-replicated constants 0xfe, 0x80, 0x04 and 0x01. */
+ "dli %[tmp0], 0xfefefefefefefefe \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_fe] \n\t"
+ "dli %[tmp0], 0x8080808080808080 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_80] \n\t"
+ "dli %[tmp0], 0x0404040404040404 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_04] \n\t"
+ "dli %[tmp0], 0x0101010101010101 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_01] \n\t"
+ /* Move to row 4, two bytes left of the edge; rows are then addressed
+ * at -4..+3 times src_pixel_step from src_ptr. */
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step_x4])
+ MMI_SUBU(%[src_ptr], %[src_ptr], 0x02)
+
+ "1: \n\t"
+ /* addr0 stays at src_ptr + step for the whole iteration. */
+ MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step])
+ /* rows +3 and +2 */
+ MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step_x2])
+ "gslwlc1 %[ftmp0], 0x03(%[addr1]) \n\t"
+ "gslwrc1 %[ftmp0], 0x00(%[addr1]) \n\t"
+ MMI_ADDU(%[addr1], %[src_ptr], %[src_pixel_step_x2])
+ "gslwlc1 %[ftmp6], 0x03(%[addr1]) \n\t"
+ "gslwrc1 %[ftmp6], 0x00(%[addr1]) \n\t"
+ "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
+
+ /* rows +1 and 0 */
+ MMI_ADDU(%[addr1], %[src_ptr], %[src_pixel_step])
+ "gslwlc1 %[ftmp0], 0x03(%[addr1]) \n\t"
+ "gslwrc1 %[ftmp0], 0x00(%[addr1]) \n\t"
+ "gslwlc1 %[ftmp4], 0x03(%[src_ptr]) \n\t"
+ "gslwrc1 %[ftmp4], 0x00(%[src_ptr]) \n\t"
+
+ "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
+ "punpckhhw %[ftmp5], %[ftmp4], %[ftmp6] \n\t"
+ "punpcklhw %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
+
+ /* rows -1 and -2 */
+ MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
+ "gslwlc1 %[ftmp7], 0x03(%[addr1]) \n\t"
+ "gslwrc1 %[ftmp7], 0x00(%[addr1]) \n\t"
+ MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2])
+ "gslwlc1 %[ftmp6], 0x03(%[addr1]) \n\t"
+ "gslwrc1 %[ftmp6], 0x00(%[addr1]) \n\t"
+ "punpcklbh %[ftmp6], %[ftmp6], %[ftmp7] \n\t"
+
+ /* rows -3 and -4 */
+ MMI_SUBU(%[addr1], %[addr0], %[src_pixel_step_x4])
+ "gslwlc1 %[ftmp1], 0x03(%[addr1]) \n\t"
+ "gslwrc1 %[ftmp1], 0x00(%[addr1]) \n\t"
+ MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x4])
+ "gslwlc1 %[ftmp0], 0x03(%[addr1]) \n\t"
+ "gslwrc1 %[ftmp0], 0x00(%[addr1]) \n\t"
+ "punpcklbh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
+
+ /* Finish the transpose: ftmp0:p1 ftmp1:p0 ftmp2:q0 ftmp3:q1 */
+ "punpckhhw %[ftmp2], %[ftmp0], %[ftmp6] \n\t"
+ "punpcklhw %[ftmp0], %[ftmp0], %[ftmp6] \n\t"
+ "punpckhwd %[ftmp1], %[ftmp0], %[ftmp4] \n\t"
+ "punpcklwd %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
+ "punpckhwd %[ftmp3], %[ftmp2], %[ftmp5] \n\t"
+ "punpcklwd %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
+
+ /* ftmp6: abs(p1 - q1) / 2; the 0xfe mask keeps the halfword shift from
+ * leaking a bit into the neighbouring byte. */
+ "dli %[tmp0], 0x01 \n\t"
+ "dmtc1 %[tmp0], %[ftmp9] \n\t"
+ "pasubub %[ftmp6], %[ftmp3], %[ftmp0] \n\t"
+ "pand %[ftmp6], %[ftmp6], %[ff_pb_fe] \n\t"
+ "psrlh %[ftmp6], %[ftmp6], %[ftmp9] \n\t"
+ /* ftmp5: abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */
+ "pasubub %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
+ "paddusb %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
+ "paddusb %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
+
+ /* ftmp5: mask, 0xff where the sum above does not exceed blimit */
+ "gsldlc1 %[ftmp7], 0x07(%[blimit]) \n\t"
+ "gsldrc1 %[ftmp7], 0x00(%[blimit]) \n\t"
+ "psubusb %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
+ "pxor %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
+ "pcmpeqb %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
+
+ /* srct[0x00]: p1, srct[0x08]: q1 (unfiltered, reloaded for the
+ * write-back transpose) */
+ "sdc1 %[ftmp0], 0x00(%[srct]) \n\t"
+ "sdc1 %[ftmp3], 0x08(%[srct]) \n\t"
+
+ /* Flip the top bit of every byte (xor 0x80) so the saturating signed
+ * byte operations can be used. ftmp0 = p1 - q1 */
+ "pxor %[ftmp0], %[ftmp0], %[ff_pb_80] \n\t"
+ "pxor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t"
+ "psubsb %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
+
+ /* ftmp6: p0, ftmp3: q0; ftmp0 += 3 * (q0 - p0); then mask and add 4 */
+ "pxor %[ftmp6], %[ftmp1], %[ff_pb_80] \n\t"
+ "pxor %[ftmp3], %[ftmp2], %[ff_pb_80] \n\t"
+ "psubsb %[ftmp7], %[ftmp3], %[ftmp6] \n\t"
+ "paddsb %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
+ "paddsb %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
+ "paddsb %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
+ "pand %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
+ "paddsb %[ftmp5], %[ftmp5], %[ff_pb_04] \n\t"
+
+ /* ftmp0 = ftmp5 >> 3 per signed byte: low bytes of each halfword ... */
+ "dli %[tmp0], 0x03 \n\t"
+ "dmtc1 %[tmp0], %[ftmp9] \n\t"
+ "psllh %[ftmp0], %[ftmp5], %[ftmp8] \n\t"
+ "psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t"
+ "psrlh %[ftmp0], %[ftmp0], %[ftmp8] \n\t"
+
+ /* ... then high bytes, merged with por */
+ "dli %[tmp0], 0x0b \n\t"
+ "dmtc1 %[tmp0], %[ftmp9] \n\t"
+ "psrah %[ftmp7], %[ftmp5], %[ftmp9] \n\t"
+ "psllh %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
+ "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
+ /* ftmp3: filtered q0 */
+ "psubsb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
+ "pxor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t"
+ "psubsb %[ftmp5], %[ftmp5], %[ff_pb_01] \n\t"
+
+ /* Same per-byte >> 3 applied to (filter + 4) - 1 */
+ "dli %[tmp0], 0x03 \n\t"
+ "dmtc1 %[tmp0], %[ftmp9] \n\t"
+ "psllh %[ftmp0], %[ftmp5], %[ftmp8] \n\t"
+ "psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t"
+ "psrlh %[ftmp0], %[ftmp0], %[ftmp8] \n\t"
+
+ "dli %[tmp0], 0x0b \n\t"
+ "dmtc1 %[tmp0], %[ftmp9] \n\t"
+ "psrah %[ftmp5], %[ftmp5], %[ftmp9] \n\t"
+ "psllh %[ftmp5], %[ftmp5], %[ftmp8] \n\t"
+ "por %[ftmp0], %[ftmp0], %[ftmp5] \n\t"
+ /* ftmp6: filtered p0 */
+ "paddsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
+ "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t"
+
+ /* ftmp0: p1, ftmp4: q1 (unfiltered, from scratch) */
+ "ldc1 %[ftmp0], 0x00(%[srct]) \n\t"
+ "ldc1 %[ftmp4], 0x08(%[srct]) \n\t"
+
+ /* Transpose the four columns back into rows. */
+ "punpckhbh %[ftmp1], %[ftmp0], %[ftmp6] \n\t"
+ "punpcklbh %[ftmp0], %[ftmp0], %[ftmp6] \n\t"
+ "punpcklbh %[ftmp2], %[ftmp3], %[ftmp4] \n\t"
+ "punpckhbh %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
+
+ "punpckhhw %[ftmp6], %[ftmp0], %[ftmp2] \n\t"
+ "punpcklhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
+
+ /* row -4 */
+ MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x4])
+ "gsswlc1 %[ftmp0], 0x03(%[addr1]) \n\t"
+ "gsswrc1 %[ftmp0], 0x00(%[addr1]) \n\t"
+ "punpckhhw %[ftmp5], %[ftmp1], %[ftmp3] \n\t"
+ "punpcklhw %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
+
+ /* row -3 (upper 32 bits shifted down), then row -2 */
+ "ssrld %[ftmp0], %[ftmp0], %[ftmp10] \n\t"
+ MMI_SUBU(%[addr1], %[addr0], %[src_pixel_step_x4])
+ "gsswlc1 %[ftmp0], 0x03(%[addr1]) \n\t"
+ "gsswrc1 %[ftmp0], 0x00(%[addr1]) \n\t"
+ MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2])
+ "gsswlc1 %[ftmp6], 0x03(%[addr1]) \n\t"
+ "gsswrc1 %[ftmp6], 0x00(%[addr1]) \n\t"
+
+ /* row 0 */
+ "ssrld %[ftmp6], %[ftmp6], %[ftmp10] \n\t"
+ "gsswlc1 %[ftmp1], 0x03(%[src_ptr]) \n\t"
+ "gsswrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
+
+ /* row -1 */
+ MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
+ "gsswlc1 %[ftmp6], 0x03(%[addr1]) \n\t"
+ "gsswrc1 %[ftmp6], 0x00(%[addr1]) \n\t"
+
+ /* row +2 */
+ MMI_ADDU(%[addr1], %[src_ptr], %[src_pixel_step_x2])
+ "gsswlc1 %[ftmp5], 0x03(%[addr1]) \n\t"
+ "gsswrc1 %[ftmp5], 0x00(%[addr1]) \n\t"
+
+ /* row +1 */
+ "ssrld %[ftmp1], %[ftmp1], %[ftmp10] \n\t"
+ "gsswlc1 %[ftmp1], 0x03(%[addr0]) \n\t"
+ "gsswrc1 %[ftmp1], 0x00(%[addr0]) \n\t"
+
+ /* row +3 */
+ "ssrld %[ftmp5], %[ftmp5], %[ftmp10] \n\t"
+ MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step_x2])
+ "gsswlc1 %[ftmp5], 0x03(%[addr1]) \n\t"
+ "gsswrc1 %[ftmp5], 0x00(%[addr1]) \n\t"
+
+ /* Next group of 8 rows. */
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step_x8])
+ "addiu %[count], %[count], -0x01 \n\t"
+ "bnez %[count], 1b \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
+ [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
+ [tmp0]"=&r"(tmp[0]),
+ [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
+ [src_ptr]"+&r"(src_ptr), [count]"+&r"(count),
+ [ff_pb_fe]"=&f"(ff_pb_fe), [ff_pb_80]"=&f"(ff_pb_80),
+ [ff_pb_04]"=&f"(ff_pb_04), [ff_pb_01]"=&f"(ff_pb_01)
+ : [blimit]"r"(blimit), [srct]"r"(srct),
+ [src_pixel_step]"r"((mips_reg)src_pixel_step),
+ [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)),
+ [src_pixel_step_x4]"r"((mips_reg)(src_pixel_step<<2)),
+ [src_pixel_step_x8]"r"((mips_reg)(src_pixel_step<<3))
+ : "memory"
+ );
+ /* clang-format on */
+}
+
+/*
+ * Horizontal macroblock-edge filtering.
+ *
+ * Runs the MB horizontal-edge filter on the luma plane (last argument 2),
+ * then on each chroma plane whose pointer is non-NULL (last argument 1),
+ * always with the mblim / lim / hev_thr values taken from lfi.
+ */
+void vp8_loop_filter_mbh_mmi(unsigned char *y_ptr, unsigned char *u_ptr,
+                             unsigned char *v_ptr, int y_stride, int uv_stride,
+                             loop_filter_info *lfi) {
+  unsigned char *chroma[2];
+  int plane;
+
+  chroma[0] = u_ptr;
+  chroma[1] = v_ptr;
+
+  vp8_mbloop_filter_horizontal_edge_mmi(y_ptr, y_stride, lfi->mblim, lfi->lim,
+                                        lfi->hev_thr, 2);
+
+  /* U first, then V; a missing plane is simply skipped. */
+  for (plane = 0; plane < 2; ++plane) {
+    if (chroma[plane]) {
+      vp8_mbloop_filter_horizontal_edge_mmi(chroma[plane], uv_stride,
+                                            lfi->mblim, lfi->lim, lfi->hev_thr,
+                                            1);
+    }
+  }
+}
+
+/*
+ * Vertical macroblock-edge filtering.
+ *
+ * Runs the MB vertical-edge filter on the luma plane (last argument 2),
+ * then on each chroma plane whose pointer is non-NULL (last argument 1),
+ * always with the mblim / lim / hev_thr values taken from lfi.
+ */
+void vp8_loop_filter_mbv_mmi(unsigned char *y_ptr, unsigned char *u_ptr,
+                             unsigned char *v_ptr, int y_stride, int uv_stride,
+                             loop_filter_info *lfi) {
+  unsigned char *chroma[2];
+  int plane;
+
+  chroma[0] = u_ptr;
+  chroma[1] = v_ptr;
+
+  vp8_mbloop_filter_vertical_edge_mmi(y_ptr, y_stride, lfi->mblim, lfi->lim,
+                                      lfi->hev_thr, 2);
+
+  /* U first, then V; a missing plane is simply skipped. */
+  for (plane = 0; plane < 2; ++plane) {
+    if (chroma[plane]) {
+      vp8_mbloop_filter_vertical_edge_mmi(chroma[plane], uv_stride, lfi->mblim,
+                                          lfi->lim, lfi->hev_thr, 1);
+    }
+  }
+}
+
+/*
+ * Horizontal block-edge filtering.
+ *
+ * Luma: filters the horizontal edges at rows 4, 8 and 12 (last argument 2).
+ * Chroma: filters the edge at row 4 of each plane whose pointer is non-NULL
+ * (last argument 1). All calls use blim / lim / hev_thr from lfi.
+ */
+void vp8_loop_filter_bh_mmi(unsigned char *y_ptr, unsigned char *u_ptr,
+                            unsigned char *v_ptr, int y_stride, int uv_stride,
+                            loop_filter_info *lfi) {
+  unsigned char *chroma[2];
+  int row;
+  int plane;
+
+  chroma[0] = u_ptr;
+  chroma[1] = v_ptr;
+
+  for (row = 4; row <= 12; row += 4) {
+    vp8_loop_filter_horizontal_edge_mmi(y_ptr + row * y_stride, y_stride,
+                                        lfi->blim, lfi->lim, lfi->hev_thr, 2);
+  }
+
+  /* U first, then V; a missing plane is simply skipped. */
+  for (plane = 0; plane < 2; ++plane) {
+    if (chroma[plane]) {
+      vp8_loop_filter_horizontal_edge_mmi(chroma[plane] + 4 * uv_stride,
+                                          uv_stride, lfi->blim, lfi->lim,
+                                          lfi->hev_thr, 1);
+    }
+  }
+}
+
+/*
+ * Vertical block-edge filtering.
+ *
+ * Luma: filters the vertical edges at columns 4, 8 and 12 (last argument 2).
+ * Chroma: filters the edge at column 4 of each plane whose pointer is
+ * non-NULL (last argument 1). All calls use blim / lim / hev_thr from lfi.
+ */
+void vp8_loop_filter_bv_mmi(unsigned char *y_ptr, unsigned char *u_ptr,
+                            unsigned char *v_ptr, int y_stride, int uv_stride,
+                            loop_filter_info *lfi) {
+  unsigned char *chroma[2];
+  int col;
+  int plane;
+
+  chroma[0] = u_ptr;
+  chroma[1] = v_ptr;
+
+  for (col = 4; col <= 12; col += 4) {
+    vp8_loop_filter_vertical_edge_mmi(y_ptr + col, y_stride, lfi->blim,
+                                      lfi->lim, lfi->hev_thr, 2);
+  }
+
+  /* U first, then V; a missing plane is simply skipped. */
+  for (plane = 0; plane < 2; ++plane) {
+    if (chroma[plane]) {
+      vp8_loop_filter_vertical_edge_mmi(chroma[plane] + 4, uv_stride,
+                                        lfi->blim, lfi->lim, lfi->hev_thr, 1);
+    }
+  }
+}
+
+/*
+ * Simple-filter variant for the inner horizontal edges: applies
+ * vp8_loop_filter_simple_horizontal_edge_mmi at rows 4, 8 and 12 of the
+ * block that starts at y_ptr, in that order.
+ */
+void vp8_loop_filter_bhs_mmi(unsigned char *y_ptr, int y_stride,
+                             const unsigned char *blimit) {
+  int row;
+
+  for (row = 4; row <= 12; row += 4) {
+    vp8_loop_filter_simple_horizontal_edge_mmi(y_ptr + row * y_stride,
+                                               y_stride, blimit);
+  }
+}
+
+/*
+ * Simple-filter variant for the inner vertical edges: applies
+ * vp8_loop_filter_simple_vertical_edge_mmi at columns 4, 8 and 12 of the
+ * block that starts at y_ptr, in that order.
+ */
+void vp8_loop_filter_bvs_mmi(unsigned char *y_ptr, int y_stride,
+                             const unsigned char *blimit) {
+  int col;
+
+  for (col = 4; col <= 12; col += 4) {
+    vp8_loop_filter_simple_vertical_edge_mmi(y_ptr + col, y_stride, blimit);
+  }
+}
diff --git a/media/libvpx/libvpx/vp8/common/mips/mmi/sixtap_filter_mmi.c b/media/libvpx/libvpx/vp8/common/mips/mmi/sixtap_filter_mmi.c
new file mode 100644
index 0000000000..b85f73fdff
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/mips/mmi/sixtap_filter_mmi.c
@@ -0,0 +1,427 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp8/common/filter.h"
+#include "vpx_ports/asmdefs_mmi.h"
+
+DECLARE_ALIGNED(8, static const int16_t, vp8_six_tap_mmi[8][6 * 8]) = {
+ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0080, 0x0080, 0x0080, 0x0080, 0x0080, 0x0080, 0x0080, 0x0080,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 },
+ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa,
+ 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b,
+ 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 },
+ { 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002,
+ 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5,
+ 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c,
+ 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024,
+ 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8,
+ 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001 },
+ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7,
+ 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d,
+ 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032,
+ 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 },
+ { 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003,
+ 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0,
+ 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d,
+ 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d,
+ 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0,
+ 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003 },
+ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa,
+ 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032,
+ 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d,
+ 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 },
+ { 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001,
+ 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8,
+ 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024,
+ 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c,
+ 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5,
+ 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002 },
+ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c,
+ 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b,
+ 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }
+};
+
+/* Horizontal filter: pixel_step is 1, output_height and output_width are
+ the size of horizontal filtering output, output_height is always H + 5.
+
+ Each loop iteration reads the source bytes src_ptr[-2] .. src_ptr[6],
+ applies the six taps to produce 4 output pixels, rounds with +0x40 and an
+ arithmetic shift right by 7, clamps to 0..255, and stores the 4 results as
+ four 16-bit values (8 bytes) at output_ptr. After every row src_ptr is
+ advanced by src_pixels_per_line and output_ptr by output_width; the
+ pointer arithmetic is done by the MMI_ADDU macro, so output_width is
+ presumably a byte offset (confirm against the callers). The loop runs
+ output_height times.
+
+ vp8_filter: the six taps are read with 8-byte loads at byte offsets 0x00,
+ 0x10, 0x20, 0x30, 0x40 and 0x50, i.e. four int16_t copies of each tap. */
+static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr,
+ uint16_t *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int output_height,
+ unsigned int output_width,
+ const int16_t *vp8_filter) {
+ uint64_t tmp[1];
+ double ff_ph_40;
+ /* The asm operands are pinned to explicit floating-point registers; the
+ * O32 ABI branch uses even-numbered registers only. */
+#if _MIPS_SIM == _ABIO32
+ register double fzero asm("$f0");
+ register double ftmp0 asm("$f2");
+ register double ftmp1 asm("$f4");
+ register double ftmp2 asm("$f6");
+ register double ftmp3 asm("$f8");
+ register double ftmp4 asm("$f10");
+ register double ftmp5 asm("$f12");
+ register double ftmp6 asm("$f14");
+ register double ftmp7 asm("$f16");
+ register double ftmp8 asm("$f18");
+ register double ftmp9 asm("$f20");
+ register double ftmp10 asm("$f22");
+ register double ftmp11 asm("$f24");
+#else
+ register double fzero asm("$f0");
+ register double ftmp0 asm("$f1");
+ register double ftmp1 asm("$f2");
+ register double ftmp2 asm("$f3");
+ register double ftmp3 asm("$f4");
+ register double ftmp4 asm("$f5");
+ register double ftmp5 asm("$f6");
+ register double ftmp6 asm("$f7");
+ register double ftmp7 asm("$f8");
+ register double ftmp8 asm("$f9");
+ register double ftmp9 asm("$f10");
+ register double ftmp10 asm("$f11");
+ register double ftmp11 asm("$f12");
+#endif // _MIPS_SIM == _ABIO32
+
+ /* clang-format off */
+ __asm__ volatile (
+ /* ff_ph_40: rounding constant 0x0040 in every halfword */
+ "dli %[tmp0], 0x0040004000400040 \n\t"
+ "dmtc1 %[tmp0], %[ff_ph_40] \n\t"
+ /* ftmp0..ftmp5: taps 0..5, four copies each */
+ "ldc1 %[ftmp0], 0x00(%[vp8_filter]) \n\t"
+ "ldc1 %[ftmp1], 0x10(%[vp8_filter]) \n\t"
+ "ldc1 %[ftmp2], 0x20(%[vp8_filter]) \n\t"
+ "ldc1 %[ftmp3], 0x30(%[vp8_filter]) \n\t"
+ "ldc1 %[ftmp4], 0x40(%[vp8_filter]) \n\t"
+ "ldc1 %[ftmp5], 0x50(%[vp8_filter]) \n\t"
+ "pxor %[fzero], %[fzero], %[fzero] \n\t"
+ /* ftmp7: final shift count (7); ftmp11: one-byte shift count (8 bits) */
+ "dli %[tmp0], 0x07 \n\t"
+ "dmtc1 %[tmp0], %[ftmp7] \n\t"
+ "dli %[tmp0], 0x08 \n\t"
+ "dmtc1 %[tmp0], %[ftmp11] \n\t"
+
+ "1: \n\t"
+ /* ftmp9: src[-2..5], ftmp10: src[-1..6] */
+ "gsldlc1 %[ftmp9], 0x05(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp9], -0x02(%[src_ptr]) \n\t"
+ "gsldlc1 %[ftmp10], 0x06(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp10], -0x01(%[src_ptr]) \n\t"
+
+ /* src[-2..1] * tap0 */
+ "punpcklbh %[ftmp6], %[ftmp9], %[fzero] \n\t"
+ "pmullh %[ftmp8], %[ftmp6], %[ftmp0] \n\t"
+
+ /* src[2..5] * tap4 */
+ "punpckhbh %[ftmp6], %[ftmp9], %[fzero] \n\t"
+ "pmullh %[ftmp6], %[ftmp6], %[ftmp4] \n\t"
+ "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t"
+
+ /* src[-1..2] * tap1 */
+ "punpcklbh %[ftmp6], %[ftmp10], %[fzero] \n\t"
+ "pmullh %[ftmp6], %[ftmp6], %[ftmp1] \n\t"
+ "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t"
+
+ /* src[3..6] * tap5 */
+ "punpckhbh %[ftmp6], %[ftmp10], %[fzero] \n\t"
+ "pmullh %[ftmp6], %[ftmp6], %[ftmp5] \n\t"
+ "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t"
+
+ /* shift ftmp10 down one byte: src[0..3] * tap2 */
+ "ssrld %[ftmp10], %[ftmp10], %[ftmp11] \n\t"
+ "punpcklbh %[ftmp6], %[ftmp10], %[fzero] \n\t"
+ "pmullh %[ftmp6], %[ftmp6], %[ftmp2] \n\t"
+ "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t"
+
+ /* and one more byte: src[1..4] * tap3 */
+ "ssrld %[ftmp10], %[ftmp10], %[ftmp11] \n\t"
+ "punpcklbh %[ftmp6], %[ftmp10], %[fzero] \n\t"
+ "pmullh %[ftmp6], %[ftmp6], %[ftmp3] \n\t"
+ "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t"
+
+ /* round, shift by 7, clamp to unsigned bytes, widen back to halfwords */
+ "paddsh %[ftmp8], %[ftmp8], %[ff_ph_40] \n\t"
+ "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
+ "packushb %[ftmp8], %[ftmp8], %[fzero] \n\t"
+ "punpcklbh %[ftmp8], %[ftmp8], %[fzero] \n\t"
+ "gssdlc1 %[ftmp8], 0x07(%[output_ptr]) \n\t"
+ "gssdrc1 %[ftmp8], 0x00(%[output_ptr]) \n\t"
+
+ /* next row */
+ "addiu %[output_height], %[output_height], -0x01 \n\t"
+ MMI_ADDU(%[output_ptr], %[output_ptr], %[output_width])
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixels_per_line])
+ "bnez %[output_height], 1b \n\t"
+ : [fzero]"=&f"(fzero), [ftmp0]"=&f"(ftmp0),
+ [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2),
+ [ftmp3]"=&f"(ftmp3), [ftmp4]"=&f"(ftmp4),
+ [ftmp5]"=&f"(ftmp5), [ftmp6]"=&f"(ftmp6),
+ [ftmp7]"=&f"(ftmp7), [ftmp8]"=&f"(ftmp8),
+ [ftmp9]"=&f"(ftmp9), [ftmp10]"=&f"(ftmp10),
+ [ftmp11]"=&f"(ftmp11), [tmp0]"=&r"(tmp[0]),
+ [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height),
+ [src_ptr]"+&r"(src_ptr), [ff_ph_40]"=&f"(ff_ph_40)
+ : [src_pixels_per_line]"r"((mips_reg)src_pixels_per_line),
+ [vp8_filter]"r"(vp8_filter), [output_width]"r"(output_width)
+ : "memory"
+ );
+ /* clang-format on */
+}
+
+/* Vertical filter: pixel_step is always W */
+static INLINE void vp8_filter_block1dc_v6_mmi(
+ uint16_t *src_ptr, unsigned char *output_ptr, unsigned int output_height,
+ int output_pitch, unsigned int pixels_per_line, const int16_t *vp8_filter) {
+ double ff_ph_40;
+ uint64_t tmp[1];
+ mips_reg addr[1];
+
+#if _MIPS_SIM == _ABIO32
+ register double fzero asm("$f0");
+ register double ftmp0 asm("$f2");
+ register double ftmp1 asm("$f4");
+ register double ftmp2 asm("$f6");
+ register double ftmp3 asm("$f8");
+ register double ftmp4 asm("$f10");
+ register double ftmp5 asm("$f12");
+ register double ftmp6 asm("$f14");
+ register double ftmp7 asm("$f16");
+ register double ftmp8 asm("$f18");
+ register double ftmp9 asm("$f20");
+ register double ftmp10 asm("$f22");
+ register double ftmp11 asm("$f24");
+ register double ftmp12 asm("$f26");
+ register double ftmp13 asm("$f28");
+#else
+ register double fzero asm("$f0");
+ register double ftmp0 asm("$f1");
+ register double ftmp1 asm("$f2");
+ register double ftmp2 asm("$f3");
+ register double ftmp3 asm("$f4");
+ register double ftmp4 asm("$f5");
+ register double ftmp5 asm("$f6");
+ register double ftmp6 asm("$f7");
+ register double ftmp7 asm("$f8");
+ register double ftmp8 asm("$f9");
+ register double ftmp9 asm("$f10");
+ register double ftmp10 asm("$f11");
+ register double ftmp11 asm("$f12");
+ register double ftmp12 asm("$f13");
+ register double ftmp13 asm("$f14");
+#endif // _MIPS_SIM == _ABIO32
+
+ /* clang-format off */
+ __asm__ volatile (
+ "dli %[tmp0], 0x0040004000400040 \n\t"
+ "dmtc1 %[tmp0], %[ff_ph_40] \n\t"
+ "ldc1 %[ftmp0], 0x00(%[vp8_filter]) \n\t"
+ "ldc1 %[ftmp1], 0x10(%[vp8_filter]) \n\t"
+ "ldc1 %[ftmp2], 0x20(%[vp8_filter]) \n\t"
+ "ldc1 %[ftmp3], 0x30(%[vp8_filter]) \n\t"
+ "ldc1 %[ftmp4], 0x40(%[vp8_filter]) \n\t"
+ "ldc1 %[ftmp5], 0x50(%[vp8_filter]) \n\t"
+ "pxor %[fzero], %[fzero], %[fzero] \n\t"
+ "dli %[tmp0], 0x07 \n\t"
+ "dmtc1 %[tmp0], %[ftmp13] \n\t"
+
+ /* In order to make full use of memory load delay slot,
+ * Operation of memory loading and calculating has been rearranged.
+ */
+ "1: \n\t"
+ "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t"
+ MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line])
+ "gsldlc1 %[ftmp7], 0x07(%[addr0]) \n\t"
+ "gsldrc1 %[ftmp7], 0x00(%[addr0]) \n\t"
+ MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x2])
+ "gsldlc1 %[ftmp8], 0x07(%[addr0]) \n\t"
+ "gsldrc1 %[ftmp8], 0x00(%[addr0]) \n\t"
+
+ MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x4])
+ "gsldlc1 %[ftmp9], 0x07(%[addr0]) \n\t"
+ "gsldrc1 %[ftmp9], 0x00(%[addr0]) \n\t"
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[pixels_per_line])
+ MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x2])
+ "gsldlc1 %[ftmp10], 0x07(%[addr0]) \n\t"
+ "gsldrc1 %[ftmp10], 0x00(%[addr0]) \n\t"
+ MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x4])
+ "gsldlc1 %[ftmp11], 0x07(%[addr0]) \n\t"
+ "gsldrc1 %[ftmp11], 0x00(%[addr0]) \n\t"
+
+ "pmullh %[ftmp12], %[ftmp6], %[ftmp0] \n\t"
+
+ "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
+ "paddsh %[ftmp12], %[ftmp12], %[ftmp7] \n\t"
+
+ "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t"
+ "paddsh %[ftmp12], %[ftmp12], %[ftmp8] \n\t"
+
+ "pmullh %[ftmp9], %[ftmp9], %[ftmp4] \n\t"
+ "paddsh %[ftmp12], %[ftmp12], %[ftmp9] \n\t"
+
+ "pmullh %[ftmp10], %[ftmp10], %[ftmp3] \n\t"
+ "paddsh %[ftmp12], %[ftmp12], %[ftmp10] \n\t"
+
+ "pmullh %[ftmp11], %[ftmp11], %[ftmp5] \n\t"
+ "paddsh %[ftmp12], %[ftmp12], %[ftmp11] \n\t"
+
+ "paddsh %[ftmp12], %[ftmp12], %[ff_ph_40] \n\t"
+ "psrah %[ftmp12], %[ftmp12], %[ftmp13] \n\t"
+ "packushb %[ftmp12], %[ftmp12], %[fzero] \n\t"
+ "gsswlc1 %[ftmp12], 0x03(%[output_ptr]) \n\t"
+ "gsswrc1 %[ftmp12], 0x00(%[output_ptr]) \n\t"
+
+ MMI_ADDIU(%[output_height], %[output_height], -0x01)
+ MMI_ADDU(%[output_ptr], %[output_ptr], %[output_pitch])
+ "bnez %[output_height], 1b \n\t"
+ : [fzero]"=&f"(fzero), [ftmp0]"=&f"(ftmp0),
+ [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2),
+ [ftmp3]"=&f"(ftmp3), [ftmp4]"=&f"(ftmp4),
+ [ftmp5]"=&f"(ftmp5), [ftmp6]"=&f"(ftmp6),
+ [ftmp7]"=&f"(ftmp7), [ftmp8]"=&f"(ftmp8),
+ [ftmp9]"=&f"(ftmp9), [ftmp10]"=&f"(ftmp10),
+ [ftmp11]"=&f"(ftmp11), [ftmp12]"=&f"(ftmp12),
+ [ftmp13]"=&f"(ftmp13), [tmp0]"=&r"(tmp[0]),
+ [addr0]"=&r"(addr[0]), [src_ptr]"+&r"(src_ptr),
+ [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height),
+ [ff_ph_40]"=&f"(ff_ph_40)
+ : [pixels_per_line]"r"((mips_reg)pixels_per_line),
+ [pixels_per_line_x2]"r"((mips_reg)(pixels_per_line<<1)),
+ [pixels_per_line_x4]"r"((mips_reg)(pixels_per_line<<2)),
+ [vp8_filter]"r"(vp8_filter),
+ [output_pitch]"r"((mips_reg)output_pitch)
+ : "memory"
+ );
+ /* clang-format on */
+}
+
+/* Horizontal-pass shortcut for xoffset == 0, where vp8_filter is
+ * {0, 0, 128, 0, 0, 0} and vp8_filter_block1d_h6_mmi can be simplified (the
+ * vertical counterpart is vp8_filter_block1dc_v6_filter0_mmi).
+ *
+ * Each iteration loads 8 bytes from src_ptr, interleaves the low bytes with
+ * zero (punpcklbh against fzero) and stores the 8-byte result to output_ptr.
+ * src_ptr then advances by src_pixels_per_line and output_ptr by
+ * output_width; both are added to the raw address by the asm.
+ * The loop tests output_height only after the first iteration, so
+ * output_height must be non-zero on entry. */
+static INLINE void vp8_filter_block1d_h6_filter0_mmi(
+ unsigned char *src_ptr, uint16_t *output_ptr,
+ unsigned int src_pixels_per_line, unsigned int output_height,
+ unsigned int output_width) {
+#if _MIPS_SIM == _ABIO32
+ register double fzero asm("$f0");
+ register double ftmp0 asm("$f2");
+ register double ftmp1 asm("$f4");
+#else
+ register double fzero asm("$f0");
+ register double ftmp0 asm("$f1");
+ register double ftmp1 asm("$f2");
+#endif // _MIPS_SIM == _ABIO32
+
+ /* clang-format off */
+ __asm__ volatile (
+ "pxor %[fzero], %[fzero], %[fzero] \n\t"
+
+ "1: \n\t"
+ "gsldlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t"
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixels_per_line])
+
+ "punpcklbh %[ftmp1], %[ftmp0], %[fzero] \n\t"
+ "gssdlc1 %[ftmp1], 0x07(%[output_ptr]) \n\t"
+ "gssdrc1 %[ftmp1], 0x00(%[output_ptr]) \n\t"
+
+ "addiu %[output_height], %[output_height], -0x01 \n\t"
+ MMI_ADDU(%[output_ptr], %[output_ptr], %[output_width])
+ "bnez %[output_height], 1b \n\t"
+ : [fzero]"=&f"(fzero), [ftmp0]"=&f"(ftmp0),
+ [ftmp1]"=&f"(ftmp1), [src_ptr]"+&r"(src_ptr),
+ [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height)
+ : [src_pixels_per_line]"r"((mips_reg)src_pixels_per_line),
+ [output_width]"r"(output_width)
+ : "memory"
+ );
+ /* clang-format on */
+}
+/* Vertical-pass shortcut used when yoffset == 0 (see sixtapNxM).
+ *
+ * Each iteration loads 8 bytes (four uint16_t values) from src_ptr, packs
+ * them to bytes with packushb and stores 4 bytes to output_ptr. src_ptr then
+ * advances by pixels_per_line and output_ptr by output_pitch; both are added
+ * to the raw address by the asm.
+ * The loop tests output_height only after the first iteration, so
+ * output_height must be non-zero on entry. */
+static INLINE void vp8_filter_block1dc_v6_filter0_mmi(
+ uint16_t *src_ptr, unsigned char *output_ptr, unsigned int output_height,
+ int output_pitch, unsigned int pixels_per_line) {
+#if _MIPS_SIM == _ABIO32
+ register double fzero asm("$f0");
+ register double ftmp0 asm("$f2");
+ register double ftmp1 asm("$f4");
+#else
+ register double fzero asm("$f0");
+ register double ftmp0 asm("$f1");
+ register double ftmp1 asm("$f2");
+#endif // _MIPS_SIM == _ABIO32
+
+ /* clang-format off */
+ __asm__ volatile (
+ "pxor %[fzero], %[fzero], %[fzero] \n\t"
+
+ "1: \n\t"
+ "gsldlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t"
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[pixels_per_line])
+ MMI_ADDIU(%[output_height], %[output_height], -0x01)
+ "packushb %[ftmp1], %[ftmp0], %[fzero] \n\t"
+ "gsswlc1 %[ftmp1], 0x03(%[output_ptr]) \n\t"
+ "gsswrc1 %[ftmp1], 0x00(%[output_ptr]) \n\t"
+
+ MMI_ADDU(%[output_ptr], %[output_ptr], %[output_pitch])
+ "bnez %[output_height], 1b \n\t"
+ : [fzero]"=&f"(fzero), [ftmp0]"=&f"(ftmp0),
+ [ftmp1]"=&f"(ftmp1), [src_ptr]"+&r"(src_ptr),
+ [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height)
+ : [pixels_per_line]"r"((mips_reg)pixels_per_line),
+ [output_pitch]"r"((mips_reg)output_pitch)
+ : "memory"
+ );
+ /* clang-format on */
+}
+
+/* Defines vp8_sixtap_predict<n>x<m>_mmi(), a two-pass six-tap predictor.
+ *
+ * Pass 1 (horizontal) starts two rows above src_ptr, produces m + 5 rows of
+ * 16-bit intermediates in FData2 (row pitch n * 2 bytes) and works in strips
+ * of 4 columns. Pass 2 (vertical) turns those rows into m output rows in
+ * dst_ptr. When an offset is 0 the corresponding *_filter0_mmi shortcut is
+ * used; the vertical shortcut starts two intermediate rows into FData2. */
+#define sixtapNxM(n, m) \
+  void vp8_sixtap_predict##n##x##m##_mmi( \
+      unsigned char *src_ptr, int src_pixels_per_line, int xoffset, \
+      int yoffset, unsigned char *dst_ptr, int dst_pitch) { \
+    DECLARE_ALIGNED(16, uint16_t, \
+                    FData2[(n + 5) * (n == 16 ? 24 : (n == 8 ? 16 : n))]); \
+    const int16_t *HFilter = vp8_six_tap_mmi[xoffset]; \
+    const int16_t *VFilter = vp8_six_tap_mmi[yoffset]; \
+    unsigned char *top_row = src_ptr - (2 * src_pixels_per_line); \
+    const int strips = n / 4; \
+    int strip; \
+ \
+    /* Pass 1: horizontal, one 4-column strip per call. */ \
+    for (strip = 0; strip < strips; ++strip) { \
+      if (xoffset == 0) { \
+        vp8_filter_block1d_h6_filter0_mmi(top_row + strip * 4, \
+                                          FData2 + strip * 4, \
+                                          src_pixels_per_line, m + 5, n * 2); \
+      } else { \
+        vp8_filter_block1d_h6_mmi(top_row + strip * 4, FData2 + strip * 4, \
+                                  src_pixels_per_line, m + 5, n * 2, \
+                                  HFilter); \
+      } \
+    } \
+ \
+    /* Pass 2: vertical, one 4-column strip per call. */ \
+    for (strip = 0; strip < strips; ++strip) { \
+      if (yoffset == 0) { \
+        vp8_filter_block1dc_v6_filter0_mmi(FData2 + n * 2 + strip * 4, \
+                                           dst_ptr + strip * 4, m, dst_pitch, \
+                                           n * 2); \
+      } else { \
+        vp8_filter_block1dc_v6_mmi(FData2 + strip * 4, dst_ptr + strip * 4, \
+                                   m, dst_pitch, n * 2, VFilter); \
+      } \
+    } \
+  }
+/* Instantiate the predictors for block sizes 4x4, 8x8, 8x4 and 16x16. */
+sixtapNxM(4, 4);
+sixtapNxM(8, 8);
+sixtapNxM(8, 4);
+sixtapNxM(16, 16);
diff --git a/media/libvpx/libvpx/vp8/common/mips/msa/bilinear_filter_msa.c b/media/libvpx/libvpx/vp8/common/mips/msa/bilinear_filter_msa.c
new file mode 100644
index 0000000000..c7fb1ed33f
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/mips/msa/bilinear_filter_msa.c
@@ -0,0 +1,797 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vpx_ports/mem.h"
+#include "vp8/common/filter.h"
+#include "vp8/common/mips/msa/vp8_macros_msa.h"
+/* 2-tap filter pairs; each pair sums to 128. The predict functions in this
+ * file index the table with (offset - 1), so there is no row for offset 0. */
+DECLARE_ALIGNED(16, static const int8_t, vp8_bilinear_filters_msa[7][2]) = {
+ { 112, 16 }, { 96, 32 }, { 80, 48 }, { 64, 64 },
+ { 48, 80 }, { 32, 96 }, { 16, 112 }
+};
+/* Byte-index tables loaded 16 entries at a time as the `mask` vector of the
+ * horizontal filters. Every consecutive pair (i, i + 1) names two adjacent
+ * bytes. NOTE(review): indices >= 16 presumably select from the second source
+ * vector of the shuffle - confirm against VSHF_B2_* in vp8_macros_msa.h. */
+static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = {
+ /* 8 width cases (loaded from offset 0 by the 8- and 16-wide paths) */
+ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+ /* 4 width cases (loaded from offset 16 by the 4-wide paths) */
+ 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+ /* 4 width cases (this row is not referenced anywhere in this file) */
+ 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+};
+/* Horizontal 2-tap filter for one 4x4 block: loads four rows from src,
+ * shuffles them with the 4-wide mask, multiplies by the tap pair splatted
+ * from halfword 0 of `filter`, applies the VP8_FILTER_SHIFT rounding shift
+ * and writes a 4x4 block to dst.
+ * NOTE(review): LD_UH(filter) presumably reads a whole vector although only
+ * halfword 0 is used - confirm reading past the 2-byte table row is OK. */
+static void common_hz_2t_4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
+ uint8_t *RESTRICT dst, int32_t dst_stride,
+ const int8_t *filter) {
+ v16i8 src0, src1, src2, src3, mask;
+ v16u8 filt0, vec0, vec1, res0, res1;
+ v8u16 vec2, vec3, filt;
+
+ mask = LD_SB(&vp8_mc_filt_mask_arr[16]);
+
+ filt = LD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
+ SRARI_H2_UH(vec2, vec3, VP8_FILTER_SHIFT);
+ PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
+ ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+}
+/* Horizontal 2-tap filter for one 4x8 block: same steps as the 4x4 variant
+ * but on eight source rows, written to dst as two consecutive 4x4 stores.
+ * NOTE(review): LD_UH(filter) presumably reads a whole vector although only
+ * halfword 0 is used - confirm reading past the 2-byte table row is OK. */
+static void common_hz_2t_4x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
+ uint8_t *RESTRICT dst, int32_t dst_stride,
+ const int8_t *filter) {
+ v16u8 vec0, vec1, vec2, vec3, filt0;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ v16i8 res0, res1, res2, res3;
+ v8u16 vec4, vec5, vec6, vec7, filt;
+
+ mask = LD_SB(&vp8_mc_filt_mask_arr[16]);
+
+ filt = LD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
+ VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
+ vec6, vec7);
+ SRARI_H4_UH(vec4, vec5, vec6, vec7, VP8_FILTER_SHIFT);
+ PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
+ res3);
+ ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+ dst += (4 * dst_stride);
+ ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
+}
+
+/* Picks the 4-wide horizontal 2-tap kernel matching `height`.
+ * Heights other than 4 and 8 are ignored: nothing is written to dst. */
+static void common_hz_2t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                uint8_t *RESTRICT dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height) {
+  switch (height) {
+    case 4:
+      common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+      break;
+    case 8:
+      common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
+      break;
+    default: break; /* unsupported height: no-op */
+  }
+}
+/* Horizontal 2-tap filter for one 8x4 block: loads four rows, shuffles each
+ * row with itself using the 8-wide mask, multiplies by the tap pair splatted
+ * from halfword 0 of `filter`, applies the VP8_FILTER_SHIFT rounding shift
+ * and writes an 8x4 block to dst. src0/src1 are reused for the packed output.
+ * NOTE(review): LD_UH(filter) presumably reads a whole vector although only
+ * halfword 0 is used - confirm reading past the 2-byte table row is OK. */
+static void common_hz_2t_8x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
+ uint8_t *RESTRICT dst, int32_t dst_stride,
+ const int8_t *filter) {
+ v16u8 filt0;
+ v16i8 src0, src1, src2, src3, mask;
+ v8u16 vec0, vec1, vec2, vec3, filt;
+
+ mask = LD_SB(&vp8_mc_filt_mask_arr[0]);
+
+ filt = LD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ SRARI_H4_UH(vec0, vec1, vec2, vec3, VP8_FILTER_SHIFT);
+ PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1);
+ ST8x4_UB(src0, src1, dst, dst_stride);
+}
+/* Horizontal 2-tap filter for 8-wide blocks of height 8 or 16. Two groups of
+ * four rows are always processed; two further groups are processed only when
+ * height == 16 (any other height yields exactly 8 output rows). The loads
+ * for the next group are issued before the previous group is stored.
+ * NOTE(review): LD_UH(filter) presumably reads a whole vector although only
+ * halfword 0 is used - confirm reading past the 2-byte table row is OK. */
+static void common_hz_2t_8x8mult_msa(uint8_t *RESTRICT src, int32_t src_stride,
+ uint8_t *RESTRICT dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height) {
+ v16u8 filt0;
+ v16i8 src0, src1, src2, src3, mask, out0, out1;
+ v8u16 vec0, vec1, vec2, vec3, filt;
+
+ mask = LD_SB(&vp8_mc_filt_mask_arr[0]);
+
+ filt = LD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ SRARI_H4_UH(vec0, vec1, vec2, vec3, VP8_FILTER_SHIFT);
+
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+ ST8x4_UB(out0, out1, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ SRARI_H4_UH(vec0, vec1, vec2, vec3, VP8_FILTER_SHIFT);
+ PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+ ST8x4_UB(out0, out1, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ if (16 == height) {
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ SRARI_H4_UH(vec0, vec1, vec2, vec3, VP8_FILTER_SHIFT);
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+ ST8x4_UB(out0, out1, dst, dst_stride);
+
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ SRARI_H4_UH(vec0, vec1, vec2, vec3, VP8_FILTER_SHIFT);
+ PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+ ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);
+ }
+}
+
+/* Picks the 8-wide horizontal 2-tap kernel: the dedicated 8x4 kernel for
+ * height 4, the 8x8mult kernel for every other height. */
+static void common_hz_2t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                uint8_t *RESTRICT dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height) {
+  if (height != 4) {
+    common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
+    return;
+  }
+
+  common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
+}
+/* Horizontal 2-tap filter for 16-wide blocks, four rows per step. Each row
+ * is loaded as two vectors (at src and src + 8). The first group of four
+ * rows is peeled off; the loop then runs (height >> 2) - 1 more times.
+ * loop_cnt is unsigned, so height must be at least 4 or the count wraps.
+ * NOTE(review): LD_UH(filter) presumably reads a whole vector although only
+ * halfword 0 is used - confirm reading past the 2-byte table row is OK. */
+static void common_hz_2t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+ uint8_t *RESTRICT dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
+
+ mask = LD_SB(&vp8_mc_filt_mask_arr[0]);
+
+ loop_cnt = (height >> 2) - 1;
+
+ filt = LD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ LD_SB4(src, src_stride, src0, src2, src4, src6);
+ LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+ src += (4 * src_stride);
+
+ VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+ VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
+ out2, out3);
+ DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
+ out6, out7);
+ SRARI_H4_UH(out0, out1, out2, out3, VP8_FILTER_SHIFT);
+ SRARI_H4_UH(out4, out5, out6, out7, VP8_FILTER_SHIFT);
+ PCKEV_ST_SB(out0, out1, dst);
+ dst += dst_stride;
+ PCKEV_ST_SB(out2, out3, dst);
+ dst += dst_stride;
+ PCKEV_ST_SB(out4, out5, dst);
+ dst += dst_stride;
+ PCKEV_ST_SB(out6, out7, dst);
+ dst += dst_stride;
+
+ for (; loop_cnt--;) {
+ LD_SB4(src, src_stride, src0, src2, src4, src6);
+ LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+ src += (4 * src_stride);
+
+ VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+ VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
+ out2, out3);
+ DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
+ out6, out7);
+ SRARI_H4_UH(out0, out1, out2, out3, VP8_FILTER_SHIFT);
+ SRARI_H4_UH(out4, out5, out6, out7, VP8_FILTER_SHIFT);
+ PCKEV_ST_SB(out0, out1, dst);
+ dst += dst_stride;
+ PCKEV_ST_SB(out2, out3, dst);
+ dst += dst_stride;
+ PCKEV_ST_SB(out4, out5, dst);
+ dst += dst_stride;
+ PCKEV_ST_SB(out6, out7, dst);
+ dst += dst_stride;
+ }
+}
+/* Vertical 2-tap filter for one 4x4 block: loads five rows (one more than is
+ * written), interleaves each row with the row below it, multiplies by the
+ * tap pair splatted from halfword 0 of `filter`, applies the
+ * VP8_FILTER_SHIFT rounding shift and writes a 4x4 block to dst.
+ * NOTE(review): LD_SH(filter) presumably reads a whole vector although only
+ * halfword 0 is used - confirm reading past the 2-byte table row is OK. */
+static void common_vt_2t_4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
+ uint8_t *RESTRICT dst, int32_t dst_stride,
+ const int8_t *filter) {
+ v16i8 src0, src1, src2, src3, src4;
+ v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
+ v16u8 filt0;
+ v8i16 filt;
+ v8u16 tmp0, tmp1;
+
+ filt = LD_SH(filter);
+ filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+ LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+ src += (5 * src_stride); /* src is not read again after this point */
+
+ ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+ src32_r, src43_r);
+ ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+ DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, VP8_FILTER_SHIFT);
+ src2110 = __msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+ ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
+}
+/* Vertical 2-tap filter for one 4x8 block: loads nine rows (eight plus the
+ * one below), interleaves each row with its successor, filters and writes
+ * two 4x4 blocks (at dst and dst + 4 * dst_stride).
+ * NOTE(review): LD_SH(filter) presumably reads a whole vector although only
+ * halfword 0 is used - confirm reading past the 2-byte table row is OK. */
+static void common_vt_2t_4x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
+ uint8_t *RESTRICT dst, int32_t dst_stride,
+ const int8_t *filter) {
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+ v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r;
+ v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v16u8 filt0;
+ v8i16 filt;
+
+ filt = LD_SH(filter);
+ filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+ LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ src += (8 * src_stride);
+
+ src8 = LD_SB(src);
+ src += src_stride; /* src is not read again after this point */
+
+ ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+ src32_r, src43_r);
+ ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
+ src76_r, src87_r);
+ ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r,
+ src76_r, src2110, src4332, src6554, src8776);
+ DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
+ tmp0, tmp1, tmp2, tmp3);
+ SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT);
+ PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
+ ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
+ ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
+}
+
+/* Picks the 4-wide vertical 2-tap kernel matching `height`.
+ * Heights other than 4 and 8 are ignored: nothing is written to dst. */
+static void common_vt_2t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                uint8_t *RESTRICT dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height) {
+  switch (height) {
+    case 4:
+      common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+      break;
+    case 8:
+      common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
+      break;
+    default: break; /* unsupported height: no-op */
+  }
+}
+/* Vertical 2-tap filter for one 8x4 block: loads five rows, interleaves each
+ * row with the row below it, multiplies by the tap pair splatted from
+ * halfword 0 of `filter`, applies the VP8_FILTER_SHIFT rounding shift and
+ * writes an 8x4 block to dst.
+ * NOTE(review): LD_SH(filter) presumably reads a whole vector although only
+ * halfword 0 is used - confirm reading past the 2-byte table row is OK. */
+static void common_vt_2t_8x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
+ uint8_t *RESTRICT dst, int32_t dst_stride,
+ const int8_t *filter) {
+ v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0;
+ v16i8 out0, out1;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v8i16 filt;
+
+ filt = LD_SH(filter);
+ filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+ LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
+ ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
+ ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
+ tmp2, tmp3);
+ SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT);
+ PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+ ST8x4_UB(out0, out1, dst, dst_stride);
+}
+/* Vertical 2-tap filter for 8-wide blocks, eight output rows per loop
+ * iteration and (height >> 3) iterations, so rows beyond a multiple of 8 are
+ * not produced. The first row is loaded before the loop; the last row of
+ * each iteration (src8) is carried over as src0 of the next one.
+ * NOTE(review): LD_SH(filter) presumably reads a whole vector although only
+ * halfword 0 is used - confirm reading past the 2-byte table row is OK. */
+static void common_vt_2t_8x8mult_msa(uint8_t *RESTRICT src, int32_t src_stride,
+ uint8_t *RESTRICT dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+ v16i8 out0, out1;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v8i16 filt;
+
+ filt = LD_SH(filter);
+ filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 3); loop_cnt--;) {
+ LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
+ src += (8 * src_stride);
+
+ ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
+ vec3);
+ ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5, vec6,
+ vec7);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
+ tmp2, tmp3);
+ SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT);
+ PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+ ST8x4_UB(out0, out1, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1,
+ tmp2, tmp3);
+ SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT);
+ PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+ ST8x4_UB(out0, out1, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ src0 = src8;
+ }
+}
+
+/* Picks the 8-wide vertical 2-tap kernel: the dedicated 8x4 kernel for
+ * height 4, the 8x8mult kernel for every other height. */
+static void common_vt_2t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                uint8_t *RESTRICT dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height) {
+  if (height != 4) {
+    common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
+    return;
+  }
+
+  common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
+}
+/* Vertical 2-tap filter for 16-wide blocks, four output rows per loop
+ * iteration and (height >> 2) iterations. The first row is loaded before the
+ * loop; the last row of each iteration (src4) is carried over as src0 of the
+ * next one. Both the right (ILVR) and left (ILVL) interleaves of each row
+ * pair are filtered and stored together as one 16-byte row.
+ * NOTE(review): LD_SH(filter) presumably reads a whole vector although only
+ * halfword 0 is used - confirm reading past the 2-byte table row is OK. */
+static void common_vt_2t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+ uint8_t *RESTRICT dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v8i16 filt;
+
+ filt = LD_SH(filter);
+ filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+
+ ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+ ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+ DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, VP8_FILTER_SHIFT);
+ PCKEV_ST_SB(tmp0, tmp1, dst);
+ dst += dst_stride;
+
+ ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
+ ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
+ DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, VP8_FILTER_SHIFT);
+ PCKEV_ST_SB(tmp2, tmp3, dst);
+ dst += dst_stride;
+
+ DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, VP8_FILTER_SHIFT);
+ PCKEV_ST_SB(tmp0, tmp1, dst);
+ dst += dst_stride;
+
+ DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, VP8_FILTER_SHIFT);
+ PCKEV_ST_SB(tmp2, tmp3, dst);
+ dst += dst_stride;
+
+ src0 = src4;
+ }
+}
+/* Combined horizontal + vertical 2-tap filter for one 4x4 block. Five source
+ * rows are filtered horizontally (two rows per HORIZ_2TAP_FILT_UH call, the
+ * fifth row paired with itself); the odd intermediate rows hz_out1/hz_out3
+ * are then assembled from their neighbours, and adjacent intermediate rows
+ * are interleaved and filtered vertically with filter_vert.
+ * NOTE(review): LD_UH() presumably reads a whole vector although only
+ * halfword 0 is used - confirm reading past the 2-byte table row is OK. */
+static void common_hv_2ht_2vt_4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
+ uint8_t *RESTRICT dst, int32_t dst_stride,
+ const int8_t *filter_horiz,
+ const int8_t *filter_vert) {
+ v16i8 src0, src1, src2, src3, src4, mask;
+ v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1;
+ v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1;
+
+ mask = LD_SB(&vp8_mc_filt_mask_arr[16]);
+
+ filt = LD_UH(filter_horiz);
+ filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
+ filt = LD_UH(filter_vert);
+ filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, VP8_FILTER_SHIFT);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, VP8_FILTER_SHIFT);
+ hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, VP8_FILTER_SHIFT);
+ hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
+ hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
+
+ ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, VP8_FILTER_SHIFT);
+ PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
+ ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+}
+/* Combined horizontal + vertical 2-tap filter for one 4x8 block. Nine source
+ * rows are filtered horizontally (two rows per HORIZ_2TAP_FILT_UH call, the
+ * ninth row paired with itself); the odd intermediate rows are assembled
+ * with SLDI_B3_UH / __msa_pckod_d, and adjacent intermediate rows are then
+ * interleaved and filtered vertically. Output is two 4x4 stores.
+ * NOTE(review): LD_UH() presumably reads a whole vector although only
+ * halfword 0 is used - confirm reading past the 2-byte table row is OK. */
+static void common_hv_2ht_2vt_4x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
+ uint8_t *RESTRICT dst, int32_t dst_stride,
+ const int8_t *filter_horiz,
+ const int8_t *filter_vert) {
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
+ v16i8 res0, res1, res2, res3;
+ v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
+ v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+ v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt;
+
+ mask = LD_SB(&vp8_mc_filt_mask_arr[16]);
+
+ filt = LD_UH(filter_horiz);
+ filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
+ filt = LD_UH(filter_vert);
+ filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ src += (8 * src_stride);
+ src8 = LD_SB(src);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, VP8_FILTER_SHIFT);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, VP8_FILTER_SHIFT);
+ hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, VP8_FILTER_SHIFT);
+ hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, VP8_FILTER_SHIFT);
+ hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, VP8_FILTER_SHIFT);
+ SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
+ hz_out3, hz_out5, 8);
+ hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6);
+
+ ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, vec4,
+ vec5, vec6, vec7);
+ SRARI_H4_UH(vec4, vec5, vec6, vec7, VP8_FILTER_SHIFT);
+ PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
+ res3);
+ ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+ dst += (4 * dst_stride);
+ ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
+}
+
+/* Picks the 4-wide combined (horizontal + vertical) 2-tap kernel matching
+ * `height`. Heights other than 4 and 8 are ignored: nothing is written. */
+static void common_hv_2ht_2vt_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                     uint8_t *RESTRICT dst, int32_t dst_stride,
+                                     const int8_t *filter_horiz,
+                                     const int8_t *filter_vert,
+                                     int32_t height) {
+  switch (height) {
+    case 4:
+      common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride,
+                                filter_horiz, filter_vert);
+      break;
+    case 8:
+      common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride,
+                                filter_horiz, filter_vert);
+      break;
+    default: break; /* unsupported height: no-op */
+  }
+}
+/* Combined horizontal + vertical 2-tap filter for one 8x4 block. Five source
+ * rows are filtered horizontally one at a time; hz_out0 and hz_out1
+ * alternate as "previous" and "current" intermediate row, and each adjacent
+ * pair is interleaved and multiplied by the vertical taps. The four results
+ * are rounded together and written as an 8x4 block.
+ * NOTE(review): LD_SH() presumably reads a whole vector although only
+ * halfword 0 is used - confirm reading past the 2-byte table row is OK. */
+static void common_hv_2ht_2vt_8x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
+ uint8_t *RESTRICT dst, int32_t dst_stride,
+ const int8_t *filter_horiz,
+ const int8_t *filter_vert) {
+ v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
+ v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
+ v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
+ v8i16 filt;
+
+ mask = LD_SB(&vp8_mc_filt_mask_arr[0]);
+
+ filt = LD_SH(filter_horiz);
+ filt_hz = (v16u8)__msa_splati_h(filt, 0);
+ filt = LD_SH(filter_vert);
+ filt_vt = (v16u8)__msa_splati_h(filt, 0);
+
+ LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, VP8_FILTER_SHIFT);
+ hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, VP8_FILTER_SHIFT);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+ tmp0 = __msa_dotp_u_h(vec0, filt_vt);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, VP8_FILTER_SHIFT);
+ vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+ tmp1 = __msa_dotp_u_h(vec1, filt_vt);
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, VP8_FILTER_SHIFT);
+ vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+ tmp2 = __msa_dotp_u_h(vec2, filt_vt);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, VP8_FILTER_SHIFT);
+ vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+ tmp3 = __msa_dotp_u_h(vec3, filt_vt);
+
+ SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT);
+ PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+ ST8x4_UB(out0, out1, dst, dst_stride);
+}
+/* Combined horizontal + vertical 2-tap filter for 8-wide blocks, eight output
+ * rows per loop iteration and (height >> 3) iterations. The first source row
+ * is filtered horizontally before the loop; inside the loop hz_out0 and
+ * hz_out1 alternate as "previous" and "current" intermediate row, and
+ * hz_out0 carries the last row into the next iteration. The second group of
+ * four source rows is loaded before the first group is stored.
+ * NOTE(review): LD_SH() presumably reads a whole vector although only
+ * halfword 0 is used - confirm reading past the 2-byte table row is OK. */
+static void common_hv_2ht_2vt_8x8mult_msa(
+ uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst,
+ int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert,
+ int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
+ v16u8 filt_hz, filt_vt, vec0;
+ v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
+ v8i16 filt;
+
+ mask = LD_SB(&vp8_mc_filt_mask_arr[0]);
+
+ filt = LD_SH(filter_horiz);
+ filt_hz = (v16u8)__msa_splati_h(filt, 0);
+ filt = LD_SH(filter_vert);
+ filt_vt = (v16u8)__msa_splati_h(filt, 0);
+
+ src0 = LD_SB(src);
+ src += src_stride;
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, VP8_FILTER_SHIFT);
+
+ for (loop_cnt = (height >> 3); loop_cnt--;) {
+ LD_SB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, VP8_FILTER_SHIFT);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+ tmp1 = __msa_dotp_u_h(vec0, filt_vt);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, VP8_FILTER_SHIFT);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+ tmp2 = __msa_dotp_u_h(vec0, filt_vt);
+
+ SRARI_H2_UH(tmp1, tmp2, VP8_FILTER_SHIFT);
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, VP8_FILTER_SHIFT);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+ tmp3 = __msa_dotp_u_h(vec0, filt_vt);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, VP8_FILTER_SHIFT);
+ LD_SB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+ tmp4 = __msa_dotp_u_h(vec0, filt_vt);
+
+ SRARI_H2_UH(tmp3, tmp4, VP8_FILTER_SHIFT);
+ PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1);
+ ST8x4_UB(out0, out1, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, VP8_FILTER_SHIFT);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+ tmp5 = __msa_dotp_u_h(vec0, filt_vt);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, VP8_FILTER_SHIFT);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+ tmp6 = __msa_dotp_u_h(vec0, filt_vt);
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, VP8_FILTER_SHIFT);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+ tmp7 = __msa_dotp_u_h(vec0, filt_vt);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, VP8_FILTER_SHIFT);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+ tmp8 = __msa_dotp_u_h(vec0, filt_vt);
+
+ SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, VP8_FILTER_SHIFT);
+ PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1);
+ ST8x4_UB(out0, out1, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+}
+
+/* Picks the 8-wide combined (horizontal + vertical) 2-tap kernel: the
+ * dedicated 8x4 kernel for height 4, the 8x8mult kernel otherwise. */
+static void common_hv_2ht_2vt_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                     uint8_t *RESTRICT dst, int32_t dst_stride,
+                                     const int8_t *filter_horiz,
+                                     const int8_t *filter_vert,
+                                     int32_t height) {
+  if (height != 4) {
+    common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride,
+                                  filter_horiz, filter_vert, height);
+    return;
+  }
+
+  common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride, filter_horiz,
+                            filter_vert);
+}
+/* Combined horizontal + vertical 2-tap filter for 16-wide blocks, four output
+ * rows per loop iteration and (height >> 2) iterations. Each row is handled
+ * as two halves loaded at src and src + 8. hz_out0/hz_out2 and
+ * hz_out1/hz_out3 hold the horizontally filtered halves of two adjacent
+ * rows and swap the "previous"/"current" role on every output row, which is
+ * why the ILVEV_B2_UB argument order alternates.
+ * NOTE(review): LD_SH() presumably reads a whole vector although only
+ * halfword 0 is used - confirm reading past the 2-byte table row is OK. */
+static void common_hv_2ht_2vt_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+ uint8_t *RESTRICT dst, int32_t dst_stride,
+ const int8_t *filter_horiz,
+ const int8_t *filter_vert,
+ int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ v16u8 filt_hz, filt_vt, vec0, vec1;
+ v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
+ v8i16 filt;
+
+ mask = LD_SB(&vp8_mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LD_SH(filter_horiz);
+ filt_hz = (v16u8)__msa_splati_h(filt, 0);
+ filt = LD_SH(filter_vert);
+ filt_vt = (v16u8)__msa_splati_h(filt, 0);
+
+ LD_SB2(src, 8, src0, src1);
+ src += src_stride;
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, VP8_FILTER_SHIFT);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, VP8_FILTER_SHIFT);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src0, src2, src4, src6);
+ LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+ src += (4 * src_stride);
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, VP8_FILTER_SHIFT);
+ hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, VP8_FILTER_SHIFT);
+ ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
+ SRARI_H2_UH(tmp1, tmp2, VP8_FILTER_SHIFT);
+ PCKEV_ST_SB(tmp1, tmp2, dst);
+ dst += dst_stride;
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, VP8_FILTER_SHIFT);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, VP8_FILTER_SHIFT);
+ ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
+ SRARI_H2_UH(tmp1, tmp2, VP8_FILTER_SHIFT);
+ PCKEV_ST_SB(tmp1, tmp2, dst);
+ dst += dst_stride;
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, VP8_FILTER_SHIFT);
+ hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, VP8_FILTER_SHIFT);
+ ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
+ SRARI_H2_UH(tmp1, tmp2, VP8_FILTER_SHIFT);
+ PCKEV_ST_SB(tmp1, tmp2, dst);
+ dst += dst_stride;
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, VP8_FILTER_SHIFT);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, VP8_FILTER_SHIFT);
+ ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
+ SRARI_H2_UH(tmp1, tmp2, VP8_FILTER_SHIFT);
+ PCKEV_ST_SB(tmp1, tmp2, dst);
+ dst += dst_stride;
+ }
+}
+
+/* 4x4 bilinear prediction (MSA).
+ *
+ * xoffset / yoffset select the horizontal / vertical tap pair from
+ * vp8_bilinear_filters_msa (row offset - 1); an offset of 0 means no
+ * filtering in that direction, and with both offsets 0 the block is copied
+ * with four 32-bit loads and stores.
+ *
+ * Each filter row is looked up only inside the branch that uses it, so the
+ * table is never indexed with -1 when an offset is 0. */
+void vp8_bilinear_predict4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                 int32_t xoffset, int32_t yoffset,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride) {
+  if (yoffset) {
+    const int8_t *v_filter = vp8_bilinear_filters_msa[yoffset - 1];
+
+    if (xoffset) {
+      const int8_t *h_filter = vp8_bilinear_filters_msa[xoffset - 1];
+
+      common_hv_2ht_2vt_4w_msa(src, src_stride, dst, dst_stride, h_filter,
+                               v_filter, 4);
+    } else {
+      common_vt_2t_4w_msa(src, src_stride, dst, dst_stride, v_filter, 4);
+    }
+  } else if (xoffset) {
+    const int8_t *h_filter = vp8_bilinear_filters_msa[xoffset - 1];
+
+    common_hz_2t_4w_msa(src, src_stride, dst, dst_stride, h_filter, 4);
+  } else {
+    uint32_t tp0, tp1, tp2, tp3;
+
+    LW4(src, src_stride, tp0, tp1, tp2, tp3);
+    SW4(tp0, tp1, tp2, tp3, dst, dst_stride);
+  }
+}
+
+/* 8x4 bilinear prediction (MSA).
+ *
+ * xoffset / yoffset select the horizontal / vertical tap pair from
+ * vp8_bilinear_filters_msa (row offset - 1); an offset of 0 means no
+ * filtering in that direction, and with both offsets 0 the block is copied
+ * via vp8_copy_mem8x4.
+ *
+ * Each filter row is looked up only inside the branch that uses it, so the
+ * table is never indexed with -1 when an offset is 0. */
+void vp8_bilinear_predict8x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                 int32_t xoffset, int32_t yoffset,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride) {
+  if (yoffset) {
+    const int8_t *v_filter = vp8_bilinear_filters_msa[yoffset - 1];
+
+    if (xoffset) {
+      const int8_t *h_filter = vp8_bilinear_filters_msa[xoffset - 1];
+
+      common_hv_2ht_2vt_8w_msa(src, src_stride, dst, dst_stride, h_filter,
+                               v_filter, 4);
+    } else {
+      common_vt_2t_8w_msa(src, src_stride, dst, dst_stride, v_filter, 4);
+    }
+  } else if (xoffset) {
+    const int8_t *h_filter = vp8_bilinear_filters_msa[xoffset - 1];
+
+    common_hz_2t_8w_msa(src, src_stride, dst, dst_stride, h_filter, 4);
+  } else {
+    vp8_copy_mem8x4(src, src_stride, dst, dst_stride);
+  }
+}
+
+/* 8x8 bilinear prediction (MSA).
+ *
+ * xoffset / yoffset select the horizontal / vertical tap pair from
+ * vp8_bilinear_filters_msa (row offset - 1); an offset of 0 means no
+ * filtering in that direction, and with both offsets 0 the block is copied
+ * via vp8_copy_mem8x8.
+ *
+ * Each filter row is looked up only inside the branch that uses it, so the
+ * table is never indexed with -1 when an offset is 0. */
+void vp8_bilinear_predict8x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                 int32_t xoffset, int32_t yoffset,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride) {
+  if (yoffset) {
+    const int8_t *v_filter = vp8_bilinear_filters_msa[yoffset - 1];
+
+    if (xoffset) {
+      const int8_t *h_filter = vp8_bilinear_filters_msa[xoffset - 1];
+
+      common_hv_2ht_2vt_8w_msa(src, src_stride, dst, dst_stride, h_filter,
+                               v_filter, 8);
+    } else {
+      common_vt_2t_8w_msa(src, src_stride, dst, dst_stride, v_filter, 8);
+    }
+  } else if (xoffset) {
+    const int8_t *h_filter = vp8_bilinear_filters_msa[xoffset - 1];
+
+    common_hz_2t_8w_msa(src, src_stride, dst, dst_stride, h_filter, 8);
+  } else {
+    vp8_copy_mem8x8(src, src_stride, dst, dst_stride);
+  }
+}
+
+/* 16x16 bilinear prediction (MSA).
+ *
+ * xoffset / yoffset select the horizontal / vertical tap pair from
+ * vp8_bilinear_filters_msa (row offset - 1); an offset of 0 means no
+ * filtering in that direction, and with both offsets 0 the block is copied
+ * via vp8_copy_mem16x16.
+ *
+ * Each filter row is looked up only inside the branch that uses it, so the
+ * table is never indexed with -1 when an offset is 0. */
+void vp8_bilinear_predict16x16_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                   int32_t xoffset, int32_t yoffset,
+                                   uint8_t *RESTRICT dst, int32_t dst_stride) {
+  if (yoffset) {
+    const int8_t *v_filter = vp8_bilinear_filters_msa[yoffset - 1];
+
+    if (xoffset) {
+      const int8_t *h_filter = vp8_bilinear_filters_msa[xoffset - 1];
+
+      common_hv_2ht_2vt_16w_msa(src, src_stride, dst, dst_stride, h_filter,
+                                v_filter, 16);
+    } else {
+      common_vt_2t_16w_msa(src, src_stride, dst, dst_stride, v_filter, 16);
+    }
+  } else if (xoffset) {
+    const int8_t *h_filter = vp8_bilinear_filters_msa[xoffset - 1];
+
+    common_hz_2t_16w_msa(src, src_stride, dst, dst_stride, h_filter, 16);
+  } else {
+    vp8_copy_mem16x16(src, src_stride, dst, dst_stride);
+  }
+}
diff --git a/media/libvpx/libvpx/vp8/common/mips/msa/copymem_msa.c b/media/libvpx/libvpx/vp8/common/mips/msa/copymem_msa.c
new file mode 100644
index 0000000000..357c99b8b6
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/mips/msa/copymem_msa.c
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vp8/common/mips/msa/vp8_macros_msa.h"
+
+/* Copy helper behind vp8_copy_mem8x4_msa(): fetches four 64-bit values from
+ * src with LD4 and writes them to dst with SD4. All four loads are issued
+ * before the first store.
+ * NOTE(review): LD4/SD4 come from vp8_macros_msa.h; presumably they step by
+ * the given stride between values - confirm against the header.
+ */
+static void copy_8x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst,
+                         int32_t dst_stride) {
+  uint64_t row0, row1, row2, row3;
+
+  LD4(src, src_stride, row0, row1, row2, row3);
+  SD4(row0, row1, row2, row3, dst, dst_stride);
+}
+
+/* Copy helper behind vp8_copy_mem8x8_msa(): moves eight 64-bit values from
+ * src to dst as two groups of four. The first group is stored before the
+ * second group is loaded, exactly as in a straight top-to-bottom copy.
+ * NOTE(review): LD4/SD4 come from vp8_macros_msa.h; presumably they step by
+ * the given stride between values - confirm against the header.
+ */
+static void copy_8x8_msa(uint8_t *src, int32_t src_stride, uint8_t *dst,
+                         int32_t dst_stride) {
+  uint8_t *const src_lower = src + (4 * src_stride);
+  uint8_t *const dst_lower = dst + (4 * dst_stride);
+  uint64_t row0, row1, row2, row3;
+
+  /* Upper four values. */
+  LD4(src, src_stride, row0, row1, row2, row3);
+  SD4(row0, row1, row2, row3, dst, dst_stride);
+
+  /* Lower four values, four strides further down. */
+  LD4(src_lower, src_stride, row0, row1, row2, row3);
+  SD4(row0, row1, row2, row3, dst_lower, dst_stride);
+}
+
+/* Copy helper behind vp8_copy_mem16x16_msa(): loads sixteen 16-byte vectors
+ * from src (two LD_UB8 groups of eight) and then stores them to dst (two
+ * ST_UB8 groups of eight). Every load completes before the first store.
+ */
+static void copy_16x16_msa(uint8_t *src, int32_t src_stride, uint8_t *dst,
+                           int32_t dst_stride) {
+  uint8_t *const src_lower = src + (8 * src_stride);
+  uint8_t *const dst_lower = dst + (8 * dst_stride);
+  v16u8 up0, up1, up2, up3, up4, up5, up6, up7;
+  v16u8 lo0, lo1, lo2, lo3, lo4, lo5, lo6, lo7;
+
+  LD_UB8(src, src_stride, up0, up1, up2, up3, up4, up5, up6, up7);
+  LD_UB8(src_lower, src_stride, lo0, lo1, lo2, lo3, lo4, lo5, lo6, lo7);
+
+  ST_UB8(up0, up1, up2, up3, up4, up5, up6, up7, dst, dst_stride);
+  ST_UB8(lo0, lo1, lo2, lo3, lo4, lo5, lo6, lo7, dst_lower, dst_stride);
+}
+
+/* Public MSA entry point for the 16x16 copy; forwards its arguments
+ * unchanged to copy_16x16_msa().
+ */
+void vp8_copy_mem16x16_msa(uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride) {
+ copy_16x16_msa(src, src_stride, dst, dst_stride);
+}
+
+/* Public MSA entry point for the 8x8 copy; forwards its arguments unchanged
+ * to copy_8x8_msa().
+ */
+void vp8_copy_mem8x8_msa(uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride) {
+ copy_8x8_msa(src, src_stride, dst, dst_stride);
+}
+
+/* Public MSA entry point for the 8x4 copy; forwards its arguments unchanged
+ * to copy_8x4_msa().
+ */
+void vp8_copy_mem8x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride) {
+ copy_8x4_msa(src, src_stride, dst, dst_stride);
+}
diff --git a/media/libvpx/libvpx/vp8/common/mips/msa/idct_msa.c b/media/libvpx/libvpx/vp8/common/mips/msa/idct_msa.c
new file mode 100644
index 0000000000..efad0c29f8
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/mips/msa/idct_msa.c
@@ -0,0 +1,406 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vp8/common/blockd.h"
+#include "vp8/common/mips/msa/vp8_macros_msa.h"
+
+/* Fixed-point multipliers for the 4x4 inverse DCT. Products with these
+ * values are shifted right by 16 in VP8_IDCT_1D_W, so they act as fractions
+ * of 65536: 20091 / 65536 ~= sqrt(2) * cos(pi / 8) - 1 and
+ * 35468 / 65536 ~= sqrt(2) * sin(pi / 8).
+ */
+static const int32_t cospi8sqrt2minus1 = 20091;
+static const int32_t sinpi8sqrt2 = 35468;
+
+/* Rearranges four v8i16 vectors: runs TRANSPOSE8X4_SH_SH on the inputs and
+ * then recombines the 64-bit halves of the intermediate vectors, taking the
+ * right halves for out0/out2 (ILVR_D2_SH) and the left halves for out1/out3
+ * (__msa_ilvl_d).
+ * NOTE(review): going by the name, this transposes two side-by-side 4x4
+ * halfword blocks at once - confirm against TRANSPOSE8X4_SH_SH.
+ */
+#define TRANSPOSE_TWO_4x4_H(in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ v8i16 s4_m, s5_m, s6_m, s7_m; \
+ \
+ TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, s4_m, s5_m, s6_m, s7_m); \
+ ILVR_D2_SH(s6_m, s4_m, s7_m, s5_m, out0, out2); \
+ out1 = (v8i16)__msa_ilvl_d((v2i64)s6_m, (v2i64)s4_m); \
+ out3 = (v8i16)__msa_ilvl_d((v2i64)s7_m, (v2i64)s5_m); \
+ }
+
+/* Statement-expression macro yielding a v8i16: multiplies each halfword of
+ * `in` by sinpi8sqrt2 in 32-bit precision and shifts the product right by 16.
+ *
+ * `in` is interleaved with zero into two v4i32 vectors, each word is shifted
+ * right by 16, multiplied by sinpi8sqrt2, shifted right by 16 again, and the
+ * even halfwords of the two results are packed back into one vector.
+ * NOTE(review): the first >>= 16 presumably sign-extends each halfword after
+ * the interleave has placed it in the upper half of the word - confirm
+ * against ILVRL_H2_SW.
+ */
+#define EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in) \
+ ({ \
+ v8i16 out_m; \
+ v8i16 zero_m = { 0 }; \
+ v4i32 tmp1_m, tmp2_m; \
+ v4i32 sinpi8_sqrt2_m = __msa_fill_w(sinpi8sqrt2); \
+ \
+ ILVRL_H2_SW(in, zero_m, tmp1_m, tmp2_m); \
+ tmp1_m >>= 16; \
+ tmp2_m >>= 16; \
+ tmp1_m = (tmp1_m * sinpi8_sqrt2_m) >> 16; \
+ tmp2_m = (tmp2_m * sinpi8_sqrt2_m) >> 16; \
+ out_m = __msa_pckev_h((v8i16)tmp2_m, (v8i16)tmp1_m); \
+ \
+ out_m; \
+ })
+
+/* One 1-D pass of the VP8 inverse DCT on v8i16 (halfword) lanes.
+ *
+ *   a1 = in0 + in2
+ *   b1 = in0 - in2
+ *   c1 = in1 * sinpi8sqrt2 - (in3 + in3 * cospi8sqrt2minus1)
+ *   d1 = (in1 + in1 * cospi8sqrt2minus1) + in3 * sinpi8sqrt2
+ *
+ * The sinpi8sqrt2 products use EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W; the
+ * cospi8sqrt2minus1 products use __msa_mul_q_h followed by >> 1. The four
+ * terms are combined into out0..out3 by BUTTERFLY_4.
+ * NOTE(review): __msa_mul_q_h is presumably a Q15 multiply, so the extra
+ * >> 1 would match the >> 16 used in VP8_IDCT_1D_W - confirm.
+ */
+#define VP8_IDCT_1D_H(in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ v8i16 a1_m, b1_m, c1_m, d1_m; \
+ v8i16 c_tmp1_m, c_tmp2_m, d_tmp1_m, d_tmp2_m; \
+ v8i16 const_cospi8sqrt2minus1_m; \
+ \
+ const_cospi8sqrt2minus1_m = __msa_fill_h(cospi8sqrt2minus1); \
+ a1_m = in0 + in2; \
+ b1_m = in0 - in2; \
+ c_tmp1_m = EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in1); \
+ c_tmp2_m = __msa_mul_q_h(in3, const_cospi8sqrt2minus1_m); \
+ c_tmp2_m = c_tmp2_m >> 1; \
+ c_tmp2_m = in3 + c_tmp2_m; \
+ c1_m = c_tmp1_m - c_tmp2_m; \
+ d_tmp1_m = __msa_mul_q_h(in1, const_cospi8sqrt2minus1_m); \
+ d_tmp1_m = d_tmp1_m >> 1; \
+ d_tmp1_m = in1 + d_tmp1_m; \
+ d_tmp2_m = EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in3); \
+ d1_m = d_tmp1_m + d_tmp2_m; \
+ BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \
+ }
+
+/* One 1-D pass of the VP8 inverse DCT on v4i32 (word) lanes.
+ *
+ *   a1 = in0 + in2
+ *   b1 = in0 - in2
+ *   c1 = ((in1 * sinpi8sqrt2) >> 16)
+ *        - (in3 + ((in3 * cospi8sqrt2minus1) >> 16))
+ *   d1 = (in1 + ((in1 * cospi8sqrt2minus1) >> 16))
+ *        + ((in3 * sinpi8sqrt2) >> 16)
+ *
+ * The four terms are combined into out0..out3 by BUTTERFLY_4.
+ */
+#define VP8_IDCT_1D_W(in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ v4i32 a1_m, b1_m, c1_m, d1_m; \
+ v4i32 c_tmp1_m, c_tmp2_m, d_tmp1_m, d_tmp2_m; \
+ v4i32 const_cospi8sqrt2minus1_m, sinpi8_sqrt2_m; \
+ \
+ const_cospi8sqrt2minus1_m = __msa_fill_w(cospi8sqrt2minus1); \
+ sinpi8_sqrt2_m = __msa_fill_w(sinpi8sqrt2); \
+ a1_m = in0 + in2; \
+ b1_m = in0 - in2; \
+ c_tmp1_m = (in1 * sinpi8_sqrt2_m) >> 16; \
+ c_tmp2_m = in3 + ((in3 * const_cospi8sqrt2minus1_m) >> 16); \
+ c1_m = c_tmp1_m - c_tmp2_m; \
+ d_tmp1_m = in1 + ((in1 * const_cospi8sqrt2minus1_m) >> 16); \
+ d_tmp2_m = (in3 * sinpi8_sqrt2_m) >> 16; \
+ d1_m = d_tmp1_m + d_tmp2_m; \
+ BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \
+ }
+
+/* Full 4x4 inverse DCT of `input`, added to the predictor and written to
+ * `dest`.
+ *
+ * input       16 coefficients, loaded as two vectors of eight halfwords.
+ * pred        predictor rows, pred_stride bytes apart.
+ * dest        output rows, dest_stride bytes apart.
+ *
+ * Both 1-D passes run in 32-bit lanes (VP8_IDCT_1D_W) with a transpose
+ * between them; the result is rounded with a shift by 3, added to the
+ * zero-extended predictor pixels, clipped to 0..255 and stored as a 4x4
+ * byte block.
+ */
+static void idct4x4_addblk_msa(int16_t *input, uint8_t *pred,
+ int32_t pred_stride, uint8_t *dest,
+ int32_t dest_stride) {
+ v8i16 input0, input1;
+ v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
+ v4i32 res0, res1, res2, res3;
+ v16i8 zero = { 0 };
+ v16i8 pred0, pred1, pred2, pred3;
+
+ /* Widen the 16 coefficients to four vectors of four words. */
+ LD_SH2(input, 8, input0, input1);
+ UNPCK_SH_SW(input0, in0, in1);
+ UNPCK_SH_SW(input1, in2, in3);
+ /* First pass, transpose, second pass, rounding shift, transpose back. */
+ VP8_IDCT_1D_W(in0, in1, in2, in3, hz0, hz1, hz2, hz3);
+ TRANSPOSE4x4_SW_SW(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3);
+ VP8_IDCT_1D_W(hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3);
+ SRARI_W4_SW(vt0, vt1, vt2, vt3, 3);
+ TRANSPOSE4x4_SW_SW(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3);
+ /* Zero-extend predictor bytes to words (byte then halfword interleave). */
+ LD_SB4(pred, pred_stride, pred0, pred1, pred2, pred3);
+ ILVR_B4_SW(zero, pred0, zero, pred1, zero, pred2, zero, pred3, res0, res1,
+ res2, res3);
+ ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3, res0, res1, res2,
+ res3);
+ ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3);
+ res0 = CLIP_SW_0_255(res0);
+ res1 = CLIP_SW_0_255(res1);
+ res2 = CLIP_SW_0_255(res2);
+ res3 = CLIP_SW_0_255(res3);
+ /* Pack words down to bytes. NOTE(review): the word indices 3, 2, 1, 0
+ * passed to ST4x4_UB presumably undo the row order left by this packing -
+ * confirm against the macro definitions. */
+ PCKEV_B2_SW(res0, res1, res2, res3, vt0, vt1);
+ res0 = (v4i32)__msa_pckev_b((v16i8)vt0, (v16i8)vt1);
+ ST4x4_UB(res0, res0, 3, 2, 1, 0, dest, dest_stride);
+}
+
+/* DC-only inverse transform: adds the rounded value of in_dc (rounding
+ * shift right by 3) to every pixel of the 4x4 predictor block and writes the
+ * clipped result to `dest`.
+ *
+ * in_dc       DC coefficient.
+ * pred        predictor rows, pred_stride bytes apart.
+ * dest        output rows, dest_stride bytes apart.
+ */
+static void idct4x4_addconst_msa(int16_t in_dc, uint8_t *pred,
+ int32_t pred_stride, uint8_t *dest,
+ int32_t dest_stride) {
+ v8i16 vec, res0, res1, res2, res3, dst0, dst1;
+ v16i8 zero = { 0 };
+ v16i8 pred0, pred1, pred2, pred3;
+
+ /* Broadcast the DC value to all lanes and round it. */
+ vec = __msa_fill_h(in_dc);
+ vec = __msa_srari_h(vec, 3);
+ /* Zero-extend predictor bytes to halfwords, add, clip to 0..255. */
+ LD_SB4(pred, pred_stride, pred0, pred1, pred2, pred3);
+ ILVR_B4_SH(zero, pred0, zero, pred1, zero, pred2, zero, pred3, res0, res1,
+ res2, res3);
+ ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3);
+ CLIP_SH4_0_255(res0, res1, res2, res3);
+ /* Pack back to bytes and gather the four rows into one vector of words. */
+ PCKEV_B2_SH(res1, res0, res3, res2, dst0, dst1);
+ dst0 = (v8i16)__msa_pckev_w((v4i32)dst1, (v4i32)dst0);
+ ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dest, dest_stride);
+}
+
+/* Inverse 4x4 Walsh-Hadamard transform (per the function name).
+ *
+ * input       16 coefficients, loaded as two vectors of eight halfwords.
+ * mb_dqcoeff  receives the 16 results at indices 0, 16, 32, ..., 240, i.e.
+ *             one value every 16 int16_t entries.
+ *             NOTE(review): presumably the DC slot of each of the 16
+ *             sub-blocks - confirm against the caller.
+ *
+ * The transform is built from add/subtract stages separated by halfword
+ * shuffles (mask0..mask3). The final values are computed as (x + 3) >> 3.
+ */
+void vp8_short_inv_walsh4x4_msa(int16_t *input, int16_t *mb_dqcoeff) {
+ v8i16 input0, input1, tmp0, tmp1, tmp2, tmp3, out0, out1;
+ const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 };
+ const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 };
+ const v8i16 mask2 = { 0, 4, 8, 12, 1, 5, 9, 13 };
+ const v8i16 mask3 = { 3, 7, 11, 15, 2, 6, 10, 14 };
+
+ LD_SH2(input, 8, input0, input1);
+ /* Swap the two 64-bit halves of input1 before the first add/subtract. */
+ input1 = (v8i16)__msa_sldi_b((v16i8)input1, (v16i8)input1, 8);
+ tmp0 = input0 + input1;
+ tmp1 = input0 - input1;
+ VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
+ out0 = tmp2 + tmp3;
+ out1 = tmp2 - tmp3;
+ VSHF_H2_SH(out0, out1, out0, out1, mask2, mask3, input0, input1);
+ tmp0 = input0 + input1;
+ tmp1 = input0 - input1;
+ VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
+ tmp0 = tmp2 + tmp3;
+ tmp1 = tmp2 - tmp3;
+ /* Final scaling: (x + 3) >> 3. */
+ ADD2(tmp0, 3, tmp1, 3, out0, out1);
+ out0 >>= 3;
+ out1 >>= 3;
+ /* Scatter the 16 results, one every 16 entries of mb_dqcoeff. */
+ mb_dqcoeff[0] = __msa_copy_s_h(out0, 0);
+ mb_dqcoeff[16] = __msa_copy_s_h(out0, 4);
+ mb_dqcoeff[32] = __msa_copy_s_h(out1, 0);
+ mb_dqcoeff[48] = __msa_copy_s_h(out1, 4);
+ mb_dqcoeff[64] = __msa_copy_s_h(out0, 1);
+ mb_dqcoeff[80] = __msa_copy_s_h(out0, 5);
+ mb_dqcoeff[96] = __msa_copy_s_h(out1, 1);
+ mb_dqcoeff[112] = __msa_copy_s_h(out1, 5);
+ mb_dqcoeff[128] = __msa_copy_s_h(out0, 2);
+ mb_dqcoeff[144] = __msa_copy_s_h(out0, 6);
+ mb_dqcoeff[160] = __msa_copy_s_h(out1, 2);
+ mb_dqcoeff[176] = __msa_copy_s_h(out1, 6);
+ mb_dqcoeff[192] = __msa_copy_s_h(out0, 3);
+ mb_dqcoeff[208] = __msa_copy_s_h(out0, 7);
+ mb_dqcoeff[224] = __msa_copy_s_h(out1, 3);
+ mb_dqcoeff[240] = __msa_copy_s_h(out1, 7);
+}
+
+/* Dequantizes one 4x4 block, applies the inverse DCT and adds the result to
+ * the pixels already in `dest` (dest is both read and written).
+ *
+ * input          16 quantized coefficients (not modified here).
+ * dequant_input  16 dequantization factors, multiplied element-wise with
+ *                input.
+ * dest           rows dest_stride bytes apart.
+ *
+ * The first 1-D pass runs in halfword lanes (VP8_IDCT_1D_H), the second in
+ * word lanes (VP8_IDCT_1D_W) followed by a rounding shift by 3.
+ */
+static void dequant_idct4x4_addblk_msa(int16_t *input, int16_t *dequant_input,
+ uint8_t *dest, int32_t dest_stride) {
+ v8i16 input0, input1, dequant_in0, dequant_in1, mul0, mul1;
+ v8i16 in0, in1, in2, in3, hz0_h, hz1_h, hz2_h, hz3_h;
+ v16u8 dest0, dest1, dest2, dest3;
+ v4i32 hz0_w, hz1_w, hz2_w, hz3_w, vt0, vt1, vt2, vt3, res0, res1, res2, res3;
+ v2i64 zero = { 0 };
+
+ /* Dequantize: coefficient * factor, element-wise. */
+ LD_SH2(input, 8, input0, input1);
+ LD_SH2(dequant_input, 8, dequant_in0, dequant_in1);
+ MUL2(input0, dequant_in0, input1, dequant_in1, mul0, mul1);
+ /* Split each product vector into its even and odd 64-bit halves. */
+ PCKEV_D2_SH(zero, mul0, zero, mul1, in0, in2);
+ PCKOD_D2_SH(zero, mul0, zero, mul1, in1, in3);
+ VP8_IDCT_1D_H(in0, in1, in2, in3, hz0_h, hz1_h, hz2_h, hz3_h);
+ /* Widen to words for the second pass. */
+ PCKEV_D2_SH(hz1_h, hz0_h, hz3_h, hz2_h, mul0, mul1);
+ UNPCK_SH_SW(mul0, hz0_w, hz1_w);
+ UNPCK_SH_SW(mul1, hz2_w, hz3_w);
+ TRANSPOSE4x4_SW_SW(hz0_w, hz1_w, hz2_w, hz3_w, hz0_w, hz1_w, hz2_w, hz3_w);
+ VP8_IDCT_1D_W(hz0_w, hz1_w, hz2_w, hz3_w, vt0, vt1, vt2, vt3);
+ SRARI_W4_SW(vt0, vt1, vt2, vt3, 3);
+ TRANSPOSE4x4_SW_SW(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3);
+ /* Zero-extend the destination bytes to words, add, clip to 0..255. */
+ LD_UB4(dest, dest_stride, dest0, dest1, dest2, dest3);
+ ILVR_B4_SW(zero, dest0, zero, dest1, zero, dest2, zero, dest3, res0, res1,
+ res2, res3);
+ ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3, res0, res1, res2,
+ res3);
+ ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3);
+ res0 = CLIP_SW_0_255(res0);
+ res1 = CLIP_SW_0_255(res1);
+ res2 = CLIP_SW_0_255(res2);
+ res3 = CLIP_SW_0_255(res3);
+ /* Pack words to bytes and store the 4x4 block. */
+ PCKEV_B2_SW(res0, res1, res2, res3, vt0, vt1);
+ res0 = (v4i32)__msa_pckev_b((v16i8)vt0, (v16i8)vt1);
+ ST4x4_UB(res0, res0, 3, 2, 1, 0, dest, dest_stride);
+}
+
+/* Dequantizes two consecutive 4x4 blocks (32 coefficients starting at
+ * `input`), applies the inverse DCT to both and adds the results to the
+ * pixels already in `dest`, then clears the 32 coefficients.
+ *
+ * input          32 quantized coefficients; zeroed (64 bytes) on return.
+ * dequant_input  16 dequantization factors, applied to both blocks.
+ * dest           rows dest_stride bytes apart; read and written, 8 bytes per
+ *                row for 4 rows.
+ *
+ * The first 1-D pass runs in halfword lanes on both blocks at once; the
+ * second pass runs in word lanes separately on the left and right halves.
+ */
+static void dequant_idct4x4_addblk_2x_msa(int16_t *input,
+                                          int16_t *dequant_input, uint8_t *dest,
+                                          int32_t dest_stride) {
+  v16u8 dest0, dest1, dest2, dest3;
+  v8i16 in0, in1, in2, in3, mul0, mul1, mul2, mul3, dequant_in0, dequant_in1;
+  v8i16 hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3, res0, res1, res2, res3;
+  v4i32 hz0l, hz1l, hz2l, hz3l, hz0r, hz1r, hz2r, hz3r;
+  v4i32 vt0l, vt1l, vt2l, vt3l, vt0r, vt1r, vt2r, vt3r;
+  v16i8 zero = { 0 };
+
+  /* Dequantize both blocks with the same factor vectors. */
+  LD_SH4(input, 8, in0, in1, in2, in3);
+  LD_SH2(dequant_input, 8, dequant_in0, dequant_in1);
+  MUL4(in0, dequant_in0, in1, dequant_in1, in2, dequant_in0, in3, dequant_in1,
+       mul0, mul1, mul2, mul3);
+  /* Pair up matching rows of the two blocks in one vector each. */
+  PCKEV_D2_SH(mul2, mul0, mul3, mul1, in0, in2);
+  PCKOD_D2_SH(mul2, mul0, mul3, mul1, in1, in3);
+  VP8_IDCT_1D_H(in0, in1, in2, in3, hz0, hz1, hz2, hz3);
+  TRANSPOSE_TWO_4x4_H(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3);
+  /* Widen to words and run the second pass on each half. */
+  UNPCK_SH_SW(hz0, hz0r, hz0l);
+  UNPCK_SH_SW(hz1, hz1r, hz1l);
+  UNPCK_SH_SW(hz2, hz2r, hz2l);
+  UNPCK_SH_SW(hz3, hz3r, hz3l);
+  VP8_IDCT_1D_W(hz0l, hz1l, hz2l, hz3l, vt0l, vt1l, vt2l, vt3l);
+  SRARI_W4_SW(vt0l, vt1l, vt2l, vt3l, 3);
+  VP8_IDCT_1D_W(hz0r, hz1r, hz2r, hz3r, vt0r, vt1r, vt2r, vt3r);
+  SRARI_W4_SW(vt0r, vt1r, vt2r, vt3r, 3);
+  PCKEV_H4_SH(vt0l, vt0r, vt1l, vt1r, vt2l, vt2r, vt3l, vt3r, vt0, vt1, vt2,
+              vt3);
+  TRANSPOSE_TWO_4x4_H(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3);
+  /* Zero-extend destination bytes to halfwords, add, clip, pack, store. */
+  LD_UB4(dest, dest_stride, dest0, dest1, dest2, dest3);
+  ILVR_B4_SH(zero, dest0, zero, dest1, zero, dest2, zero, dest3, res0, res1,
+             res2, res3);
+  ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3);
+  CLIP_SH4_0_255(res0, res1, res2, res3);
+  PCKEV_B2_SW(res1, res0, res3, res2, vt0l, vt1l);
+  ST8x4_UB(vt0l, vt1l, dest, dest_stride);
+
+  /* Clear the 32 coefficients (16 word stores = 64 bytes). The asm writes
+   * memory that is not listed as an operand, so it needs the "memory"
+   * clobber; without it the compiler may move or cache accesses to input[]
+   * across the statement. */
+  __asm__ __volatile__(
+      "sw $zero, 0(%[input]) \n\t"
+      "sw $zero, 4(%[input]) \n\t"
+      "sw $zero, 8(%[input]) \n\t"
+      "sw $zero, 12(%[input]) \n\t"
+      "sw $zero, 16(%[input]) \n\t"
+      "sw $zero, 20(%[input]) \n\t"
+      "sw $zero, 24(%[input]) \n\t"
+      "sw $zero, 28(%[input]) \n\t"
+      "sw $zero, 32(%[input]) \n\t"
+      "sw $zero, 36(%[input]) \n\t"
+      "sw $zero, 40(%[input]) \n\t"
+      "sw $zero, 44(%[input]) \n\t"
+      "sw $zero, 48(%[input]) \n\t"
+      "sw $zero, 52(%[input]) \n\t"
+      "sw $zero, 56(%[input]) \n\t"
+      "sw $zero, 60(%[input]) \n\t"
+
+      :
+      : [input] "r"(input)
+      : "memory");
+}
+
+/* DC-only path for two consecutive 4x4 blocks: dequantizes the DC
+ * coefficient of each block (input[0] and input[16], both multiplied by
+ * dequant_input[0]), rounds with a shift by 3, and adds each value to its
+ * half of an 8-wide, 4-row region of `dest`. input[0] and input[16] are
+ * cleared; no other coefficient is touched.
+ */
+static void dequant_idct_addconst_2x_msa(int16_t *input, int16_t *dequant_input,
+ uint8_t *dest, int32_t dest_stride) {
+ v8i16 input_dc0, input_dc1, vec, res0, res1, res2, res3;
+ v16u8 dest0, dest1, dest2, dest3;
+ v16i8 zero = { 0 };
+
+ input_dc0 = __msa_fill_h(input[0] * dequant_input[0]);
+ input_dc1 = __msa_fill_h(input[16] * dequant_input[0]);
+ SRARI_H2_SH(input_dc0, input_dc1, 3);
+ /* One vector holding the first block's DC in one half and the second
+ * block's DC in the other. */
+ vec = (v8i16)__msa_pckev_d((v2i64)input_dc1, (v2i64)input_dc0);
+ input[0] = 0;
+ input[16] = 0;
+ LD_UB4(dest, dest_stride, dest0, dest1, dest2, dest3);
+ ILVR_B4_SH(zero, dest0, zero, dest1, zero, dest2, zero, dest3, res0, res1,
+ res2, res3);
+ ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3);
+ CLIP_SH4_0_255(res0, res1, res2, res3);
+ PCKEV_B2_SH(res1, res0, res3, res2, res0, res1);
+ ST8x4_UB(res0, res1, dest, dest_stride);
+}
+
+/* Public MSA entry point for the full 4x4 inverse DCT plus predictor add;
+ * forwards its arguments unchanged to idct4x4_addblk_msa().
+ */
+void vp8_short_idct4x4llm_msa(int16_t *input, uint8_t *pred_ptr,
+ int32_t pred_stride, uint8_t *dst_ptr,
+ int32_t dst_stride) {
+ idct4x4_addblk_msa(input, pred_ptr, pred_stride, dst_ptr, dst_stride);
+}
+
+/* Public MSA entry point for the DC-only inverse transform plus predictor
+ * add; forwards its arguments unchanged to idct4x4_addconst_msa().
+ */
+void vp8_dc_only_idct_add_msa(int16_t input_dc, uint8_t *pred_ptr,
+ int32_t pred_stride, uint8_t *dst_ptr,
+ int32_t dst_stride) {
+ idct4x4_addconst_msa(input_dc, pred_ptr, pred_stride, dst_ptr, dst_stride);
+}
+
+/* Dequantizes one block: multiplies the 16 values at d->qcoeff element-wise
+ * by the 16 factors at DQC and stores the products to d->dqcoeff. Each array
+ * is handled as two vectors of eight halfwords.
+ */
+void vp8_dequantize_b_msa(BLOCKD *d, int16_t *DQC) {
+ v8i16 dqc0, dqc1, q0, q1, dq0, dq1;
+
+ LD_SH2(DQC, 8, dqc0, dqc1);
+ LD_SH2(d->qcoeff, 8, q0, q1);
+ MUL2(dqc0, q0, dqc1, q1, dq0, dq1);
+ ST_SH2(dq0, dq1, d->dqcoeff, 8);
+}
+
+/* Dequantizes one 4x4 block, applies the inverse DCT, adds the result to the
+ * pixels in `dest`, and then clears the 16 coefficients.
+ *
+ * input   16 quantized coefficients; zeroed (32 bytes) on return.
+ * dq      16 dequantization factors.
+ * dest    rows `stride` bytes apart; read and written.
+ */
+void vp8_dequant_idct_add_msa(int16_t *input, int16_t *dq, uint8_t *dest,
+                              int32_t stride) {
+  dequant_idct4x4_addblk_msa(input, dq, dest, stride);
+
+  /* Clear the 16 coefficients (8 word stores = 32 bytes). The asm writes
+   * memory that is not listed as an operand, so it needs the "memory"
+   * clobber; without it the compiler may move or cache accesses to input[]
+   * across the statement. */
+  __asm__ __volatile__(
+      "sw $zero, 0(%[input]) \n\t"
+      "sw $zero, 4(%[input]) \n\t"
+      "sw $zero, 8(%[input]) \n\t"
+      "sw $zero, 12(%[input]) \n\t"
+      "sw $zero, 16(%[input]) \n\t"
+      "sw $zero, 20(%[input]) \n\t"
+      "sw $zero, 24(%[input]) \n\t"
+      "sw $zero, 28(%[input]) \n\t"
+
+      :
+      : [input] "r"(input)
+      : "memory");
+}
+
+/* Dequantize + inverse-transform + add for a grid of 4 rows x 4 blocks,
+ * processed as horizontally adjacent pairs.
+ *
+ * q       coefficients, 16 per block (32 per pair); advanced pair by pair.
+ * dq      dequantization factors shared by all blocks.
+ * dst     destination; each row of blocks is 4 * stride bytes below the
+ *         previous one, the second pair of a row starts 8 bytes to the right.
+ * eobs    one end-of-block byte per block, four per row of blocks.
+ *
+ * For each pair: nothing is done if both eob bytes are 0; the DC-only path
+ * is used if neither byte has any bit above bit 0 set; otherwise the full
+ * transform runs.
+ *
+ * The eob bytes are tested as bytes. Reading them through an int16_t
+ * pointer, as an earlier form of this code did, violates the C aliasing
+ * rules and assumes 2-byte alignment of `eobs`. For a pair (a, b), (a | b)
+ * is nonzero exactly when the 16-bit value is nonzero, and
+ * ((a | b) & 0xfe) is nonzero exactly when (value & 0xfefe) is, so the
+ * decisions are unchanged.
+ */
+void vp8_dequant_idct_add_y_block_msa(int16_t *q, int16_t *dq, uint8_t *dst,
+                                      int32_t stride, char *eobs) {
+  uint8_t i;
+
+  for (i = 4; i--;) {
+    int pair_eobs = eobs[0] | eobs[1];
+
+    if (pair_eobs) {
+      if (pair_eobs & 0xfe) {
+        dequant_idct4x4_addblk_2x_msa(q, dq, dst, stride);
+      } else {
+        dequant_idct_addconst_2x_msa(q, dq, dst, stride);
+      }
+    }
+
+    q += 32;
+
+    pair_eobs = eobs[2] | eobs[3];
+
+    if (pair_eobs) {
+      if (pair_eobs & 0xfe) {
+        dequant_idct4x4_addblk_2x_msa(q, dq, dst + 8, stride);
+      } else {
+        dequant_idct_addconst_2x_msa(q, dq, dst + 8, stride);
+      }
+    }
+
+    q += 32;
+    dst += (4 * stride);
+    eobs += 4;
+  }
+}
+
+/* Dequantize + inverse-transform + add for two destination planes, each
+ * covered by two vertically stacked pairs of 4x4 blocks.
+ *
+ * q       coefficients, 32 per pair; the four pairs are consecutive (two
+ *         for dst_u, then two for dst_v).
+ * dq      dequantization factors shared by all blocks.
+ * dst_u   first plane; its second pair is 4 * stride bytes below the first.
+ * dst_v   second plane, same layout.
+ * eobs    one end-of-block byte per block, eight in total.
+ *
+ * For each pair: nothing is done if both eob bytes are 0; the DC-only path
+ * is used if neither byte has any bit above bit 0 set; otherwise the full
+ * transform runs.
+ *
+ * The eob bytes are tested as bytes. Reading them through an int16_t
+ * pointer, as an earlier form of this code did, violates the C aliasing
+ * rules and assumes 2-byte alignment of `eobs`. For a pair (a, b), (a | b)
+ * is nonzero exactly when the 16-bit value is nonzero, and
+ * ((a | b) & 0xfe) is nonzero exactly when (value & 0xfefe) is, so the
+ * decisions are unchanged.
+ */
+void vp8_dequant_idct_add_uv_block_msa(int16_t *q, int16_t *dq, uint8_t *dst_u,
+                                       uint8_t *dst_v, int32_t stride,
+                                       char *eobs) {
+  int pair_eobs;
+
+  /* First plane, upper pair. */
+  pair_eobs = eobs[0] | eobs[1];
+
+  if (pair_eobs) {
+    if (pair_eobs & 0xfe) {
+      dequant_idct4x4_addblk_2x_msa(q, dq, dst_u, stride);
+    } else {
+      dequant_idct_addconst_2x_msa(q, dq, dst_u, stride);
+    }
+  }
+
+  q += 32;
+  dst_u += (stride * 4);
+
+  /* First plane, lower pair. */
+  pair_eobs = eobs[2] | eobs[3];
+
+  if (pair_eobs) {
+    if (pair_eobs & 0xfe) {
+      dequant_idct4x4_addblk_2x_msa(q, dq, dst_u, stride);
+    } else {
+      dequant_idct_addconst_2x_msa(q, dq, dst_u, stride);
+    }
+  }
+
+  q += 32;
+
+  /* Second plane, upper pair. */
+  pair_eobs = eobs[4] | eobs[5];
+
+  if (pair_eobs) {
+    if (pair_eobs & 0xfe) {
+      dequant_idct4x4_addblk_2x_msa(q, dq, dst_v, stride);
+    } else {
+      dequant_idct_addconst_2x_msa(q, dq, dst_v, stride);
+    }
+  }
+
+  q += 32;
+  dst_v += (stride * 4);
+
+  /* Second plane, lower pair. */
+  pair_eobs = eobs[6] | eobs[7];
+
+  if (pair_eobs) {
+    if (pair_eobs & 0xfe) {
+      dequant_idct4x4_addblk_2x_msa(q, dq, dst_v, stride);
+    } else {
+      dequant_idct_addconst_2x_msa(q, dq, dst_v, stride);
+    }
+  }
+}
diff --git a/media/libvpx/libvpx/vp8/common/mips/msa/loopfilter_filters_msa.c b/media/libvpx/libvpx/vp8/common/mips/msa/loopfilter_filters_msa.c
new file mode 100644
index 0000000000..98a4fc09a3
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/mips/msa/loopfilter_filters_msa.c
@@ -0,0 +1,709 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vp8/common/loopfilter.h"
+#include "vp8/common/mips/msa/vp8_macros_msa.h"
+
+/* Edge mask for the simple loop filter. Per byte lane, `mask` is set where
+ *   sat(2 * |p0 - q0|) + (|p1 - q1| >> 1)  <=  b_limit
+ * with unsigned saturating additions, and cleared elsewhere.
+ */
+#define VP8_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask) \
+ { \
+ v16u8 p1_a_sub_q1, p0_a_sub_q0; \
+ \
+ p0_a_sub_q0 = __msa_asub_u_b(p0, q0); \
+ p1_a_sub_q1 = __msa_asub_u_b(p1, q1); \
+ p1_a_sub_q1 = (v16u8)__msa_srli_b((v16i8)p1_a_sub_q1, 1); \
+ p0_a_sub_q0 = __msa_adds_u_b(p0_a_sub_q0, p0_a_sub_q0); \
+ mask = __msa_adds_u_b(p0_a_sub_q0, p1_a_sub_q1); \
+ mask = ((v16u8)mask <= b_limit); \
+ }
+
+/* Normal (inner-edge) loop filter on 16 byte lanes; updates p1, p0, q0, q1
+ * in place.
+ *
+ * Pixels are converted to signed by XOR with 0x80, then (all adds/subs
+ * saturating):
+ *   filt = ((p1 - q1) & hev) + 3 * (q0 - p0), masked with `mask`
+ *   t1 = (filt + 4) >> 3,  t2 = (filt + 3) >> 3
+ *   q0 -= t1,  p0 += t2
+ *   filt = round(t1 / 2) & ~hev   (__msa_srari_b by 1)
+ *   q1 -= filt,  p1 += filt
+ * and the results are converted back with XOR 0x80.
+ *
+ * Side effect: `hev` is inverted in place (XOR 0xff) and left inverted.
+ */
+#define VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev) \
+ { \
+ v16i8 p1_m, p0_m, q0_m, q1_m, filt, q0_sub_p0, t1, t2; \
+ const v16i8 cnst4b = __msa_ldi_b(4); \
+ const v16i8 cnst3b = __msa_ldi_b(3); \
+ \
+ p1_m = (v16i8)__msa_xori_b(p1, 0x80); \
+ p0_m = (v16i8)__msa_xori_b(p0, 0x80); \
+ q0_m = (v16i8)__msa_xori_b(q0, 0x80); \
+ q1_m = (v16i8)__msa_xori_b(q1, 0x80); \
+ \
+ filt = __msa_subs_s_b(p1_m, q1_m); \
+ filt &= hev; \
+ q0_sub_p0 = __msa_subs_s_b(q0_m, p0_m); \
+ filt = __msa_adds_s_b(filt, q0_sub_p0); \
+ filt = __msa_adds_s_b(filt, q0_sub_p0); \
+ filt = __msa_adds_s_b(filt, q0_sub_p0); \
+ filt &= mask; \
+ t1 = __msa_adds_s_b(filt, cnst4b); \
+ t1 >>= cnst3b; \
+ t2 = __msa_adds_s_b(filt, cnst3b); \
+ t2 >>= cnst3b; \
+ q0_m = __msa_subs_s_b(q0_m, t1); \
+ q0 = __msa_xori_b((v16u8)q0_m, 0x80); \
+ p0_m = __msa_adds_s_b(p0_m, t2); \
+ p0 = __msa_xori_b((v16u8)p0_m, 0x80); \
+ filt = __msa_srari_b(t1, 1); \
+ hev = __msa_xori_b(hev, 0xff); \
+ filt &= hev; \
+ q1_m = __msa_subs_s_b(q1_m, filt); \
+ q1 = __msa_xori_b((v16u8)q1_m, 0x80); \
+ p1_m = __msa_adds_s_b(p1_m, filt); \
+ p1 = __msa_xori_b((v16u8)p1_m, 0x80); \
+ }
+
+/* Simple loop filter on 16 byte lanes; updates p0_in and q0_in in place and
+ * only reads p1_in / q1_in.
+ *
+ * Pixels are converted to signed by XOR with 0x80, then (all adds/subs
+ * saturating):
+ *   filt  = (p1 - q1) + 3 * (q0 - p0), masked with `mask`
+ *   filt1 = (filt + 4) >> 3,  filt2 = (filt + 3) >> 3
+ *   q0 -= filt1,  p0 += filt2
+ * and the results are converted back with XOR 0x80.
+ */
+#define VP8_SIMPLE_FILT(p1_in, p0_in, q0_in, q1_in, mask) \
+ { \
+ v16i8 p1_m, p0_m, q0_m, q1_m, filt, filt1, filt2; \
+ v16i8 q0_sub_p0; \
+ const v16i8 cnst4b = __msa_ldi_b(4); \
+ const v16i8 cnst3b = __msa_ldi_b(3); \
+ \
+ p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \
+ p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \
+ q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \
+ q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \
+ \
+ filt = __msa_subs_s_b(p1_m, q1_m); \
+ q0_sub_p0 = __msa_subs_s_b(q0_m, p0_m); \
+ filt = __msa_adds_s_b(filt, q0_sub_p0); \
+ filt = __msa_adds_s_b(filt, q0_sub_p0); \
+ filt = __msa_adds_s_b(filt, q0_sub_p0); \
+ filt &= mask; \
+ filt1 = __msa_adds_s_b(filt, cnst4b); \
+ filt1 >>= cnst3b; \
+ filt2 = __msa_adds_s_b(filt, cnst3b); \
+ filt2 >>= cnst3b; \
+ q0_m = __msa_subs_s_b(q0_m, filt1); \
+ p0_m = __msa_adds_s_b(p0_m, filt2); \
+ q0_in = __msa_xori_b((v16u8)q0_m, 0x80); \
+ p0_in = __msa_xori_b((v16u8)p0_m, 0x80); \
+ }
+
+/* Macroblock-edge loop filter on 16 byte lanes; updates p2, p1, p0, q0, q1,
+ * q2 in place.
+ *
+ * Pixels are converted to signed by XOR with 0x80, then (byte adds/subs
+ * saturating):
+ *   filt = (p1 - q1) + 3 * (q0 - p0), masked with `mask`
+ *   Lanes where hev is set:  t1 = (filt + 4) >> 3, t2 = (filt + 3) >> 3,
+ *                            q0 -= t1, p0 += t2
+ *   Lanes where hev is clear: filt is sign-extended to 16 bits and
+ *     u = sat8((filt *  9 + 63) >> 7) is applied to q2 / p2,
+ *     u = sat8((filt * 18 + 63) >> 7) is applied to q1 / p1,
+ *     u = sat8((filt * 27 + 63) >> 7) is applied to q0 / p0
+ *   (each as q -= u, p += u; the 18x and 27x terms are built by repeatedly
+ *   adding filt * 9 to the running sum).
+ * Results are converted back with XOR 0x80.
+ *
+ * Side effect: `hev` is inverted in place (XOR 0xff) and left inverted.
+ */
+#define VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev) \
+ { \
+ v16i8 p2_m, p1_m, p0_m, q2_m, q1_m, q0_m; \
+ v16i8 u, filt, t1, t2, filt_sign, q0_sub_p0; \
+ v8i16 filt_r, filt_l, u_r, u_l; \
+ v8i16 temp0, temp1, temp2, temp3; \
+ const v16i8 cnst4b = __msa_ldi_b(4); \
+ const v16i8 cnst3b = __msa_ldi_b(3); \
+ const v8i16 cnst9h = __msa_ldi_h(9); \
+ const v8i16 cnst63h = __msa_ldi_h(63); \
+ \
+ p2_m = (v16i8)__msa_xori_b(p2, 0x80); \
+ p1_m = (v16i8)__msa_xori_b(p1, 0x80); \
+ p0_m = (v16i8)__msa_xori_b(p0, 0x80); \
+ q0_m = (v16i8)__msa_xori_b(q0, 0x80); \
+ q1_m = (v16i8)__msa_xori_b(q1, 0x80); \
+ q2_m = (v16i8)__msa_xori_b(q2, 0x80); \
+ \
+ filt = __msa_subs_s_b(p1_m, q1_m); \
+ q0_sub_p0 = __msa_subs_s_b(q0_m, p0_m); \
+ filt = __msa_adds_s_b(filt, q0_sub_p0); \
+ filt = __msa_adds_s_b(filt, q0_sub_p0); \
+ filt = __msa_adds_s_b(filt, q0_sub_p0); \
+ filt &= mask; \
+ \
+ t2 = filt & hev; \
+ hev = __msa_xori_b(hev, 0xff); \
+ filt &= hev; \
+ t1 = __msa_adds_s_b(t2, cnst4b); \
+ t1 >>= cnst3b; \
+ t2 = __msa_adds_s_b(t2, cnst3b); \
+ t2 >>= cnst3b; \
+ q0_m = __msa_subs_s_b(q0_m, t1); \
+ p0_m = __msa_adds_s_b(p0_m, t2); \
+ filt_sign = __msa_clti_s_b(filt, 0); \
+ ILVRL_B2_SH(filt_sign, filt, filt_r, filt_l); \
+ temp0 = filt_r * cnst9h; \
+ temp1 = temp0 + cnst63h; \
+ temp2 = filt_l * cnst9h; \
+ temp3 = temp2 + cnst63h; \
+ \
+ u_r = temp1 >> 7; \
+ u_r = __msa_sat_s_h(u_r, 7); \
+ u_l = temp3 >> 7; \
+ u_l = __msa_sat_s_h(u_l, 7); \
+ u = __msa_pckev_b((v16i8)u_l, (v16i8)u_r); \
+ q2_m = __msa_subs_s_b(q2_m, u); \
+ p2_m = __msa_adds_s_b(p2_m, u); \
+ q2 = __msa_xori_b((v16u8)q2_m, 0x80); \
+ p2 = __msa_xori_b((v16u8)p2_m, 0x80); \
+ \
+ temp1 += temp0; \
+ temp3 += temp2; \
+ \
+ u_r = temp1 >> 7; \
+ u_r = __msa_sat_s_h(u_r, 7); \
+ u_l = temp3 >> 7; \
+ u_l = __msa_sat_s_h(u_l, 7); \
+ u = __msa_pckev_b((v16i8)u_l, (v16i8)u_r); \
+ q1_m = __msa_subs_s_b(q1_m, u); \
+ p1_m = __msa_adds_s_b(p1_m, u); \
+ q1 = __msa_xori_b((v16u8)q1_m, 0x80); \
+ p1 = __msa_xori_b((v16u8)p1_m, 0x80); \
+ \
+ temp1 += temp0; \
+ temp3 += temp2; \
+ \
+ u_r = temp1 >> 7; \
+ u_r = __msa_sat_s_h(u_r, 7); \
+ u_l = temp3 >> 7; \
+ u_l = __msa_sat_s_h(u_l, 7); \
+ u = __msa_pckev_b((v16i8)u_l, (v16i8)u_r); \
+ q0_m = __msa_subs_s_b(q0_m, u); \
+ p0_m = __msa_adds_s_b(p0_m, u); \
+ q0 = __msa_xori_b((v16u8)q0_m, 0x80); \
+ p0 = __msa_xori_b((v16u8)p0_m, 0x80); \
+ }
+
+/* Computes the per-lane filter decisions for an edge from the eight pixel
+ * vectors p3..p0, q0..q3.
+ *
+ *   flat_out = max(|p1 - p0|, |q1 - q0|)
+ *   hev_out  = set where thresh_in < flat_out
+ *   mask_out = set where ALL of the following hold:
+ *                sat(2 * |p0 - q0|) + (|p1 - q1| >> 1) <= b_limit_in
+ *                |p3 - p2|, |p2 - p1|, |p1 - p0|,
+ *                |q1 - q0|, |q2 - q1|, |q3 - q2|  are each <= limit_in
+ *
+ * mask_out is built by taking the running maximum of the differences (with
+ * the b_limit comparison result folded in as 0x00 / 0xff), comparing it
+ * against limit_in and inverting the result.
+ */
+#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \
+ limit_in, b_limit_in, thresh_in, hev_out, mask_out, \
+ flat_out) \
+ { \
+ v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \
+ v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \
+ \
+ p3_asub_p2_m = __msa_asub_u_b((p3_in), (p2_in)); \
+ p2_asub_p1_m = __msa_asub_u_b((p2_in), (p1_in)); \
+ p1_asub_p0_m = __msa_asub_u_b((p1_in), (p0_in)); \
+ q1_asub_q0_m = __msa_asub_u_b((q1_in), (q0_in)); \
+ q2_asub_q1_m = __msa_asub_u_b((q2_in), (q1_in)); \
+ q3_asub_q2_m = __msa_asub_u_b((q3_in), (q2_in)); \
+ p0_asub_q0_m = __msa_asub_u_b((p0_in), (q0_in)); \
+ p1_asub_q1_m = __msa_asub_u_b((p1_in), (q1_in)); \
+ flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m); \
+ hev_out = (thresh_in) < (v16u8)flat_out; \
+ p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m); \
+ p1_asub_q1_m >>= 1; \
+ p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m); \
+ mask_out = (b_limit_in) < p0_asub_q0_m; \
+ mask_out = __msa_max_u_b(flat_out, mask_out); \
+ p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m); \
+ mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out); \
+ q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m); \
+ mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out); \
+ mask_out = (limit_in) < (v16u8)mask_out; \
+ mask_out = __msa_xori_b(mask_out, 0xff); \
+ }
+
+/* Stores six bytes: the 32-bit word at index in0_idx of in0 goes to pdst,
+ * and the 16-bit halfword at index in1_idx of in1 goes to pdst + stride.
+ * Here `stride` is simply the byte offset of the halfword store; the callers
+ * in this file pass 4, which places it directly after the word.
+ */
+#define VP8_ST6x1_UB(in0, in0_idx, in1, in1_idx, pdst, stride) \
+ { \
+ uint16_t tmp0_h; \
+ uint32_t tmp0_w; \
+ \
+ tmp0_w = __msa_copy_u_w((v4i32)in0, in0_idx); \
+ tmp0_h = __msa_copy_u_h((v8i16)in1, in1_idx); \
+ SW(tmp0_w, pdst); \
+ SH(tmp0_h, pdst + stride); \
+ }
+
+/* Applies the normal loop filter across a horizontal edge, 16 pixels wide,
+ * using two sets of filter parameters.
+ *
+ * src             first row below the edge; rows src - 4 * pitch through
+ *                 src + 3 * pitch are read.
+ * pitch           distance between rows in bytes.
+ * *_limit0/1_ptr,
+ * thresh0/1_ptr   each points at a single byte; set 0 and set 1 are merged
+ *                 into one vector with __msa_ilvr_d so that each 8-lane half
+ *                 uses its own set.
+ *                 NOTE(review): presumably set 0 covers the first 8 pixels
+ *                 and set 1 the last 8 - confirm the lane order.
+ *
+ * Only the four rows p1, p0, q0, q1 (starting at src - 2 * pitch) are
+ * written back.
+ */
+static void loop_filter_horizontal_4_dual_msa(uint8_t *src, int32_t pitch,
+ const uint8_t *b_limit0_ptr,
+ const uint8_t *limit0_ptr,
+ const uint8_t *thresh0_ptr,
+ const uint8_t *b_limit1_ptr,
+ const uint8_t *limit1_ptr,
+ const uint8_t *thresh1_ptr) {
+ v16u8 mask, hev, flat;
+ v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+
+ LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+ thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr);
+ thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr);
+ thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0);
+
+ b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr);
+ b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr);
+ b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0);
+
+ limit0 = (v16u8)__msa_fill_b(*limit0_ptr);
+ limit1 = (v16u8)__msa_fill_b(*limit1_ptr);
+ limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0);
+
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev,
+ mask, flat);
+ VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+
+ ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch);
+}
+
+/* Applies the normal loop filter across a vertical edge, 16 rows tall, using
+ * two sets of filter parameters.
+ *
+ * src             first column right of the edge in the first row; 16 rows
+ *                 are loaded starting at src - 4.
+ * pitch           distance between rows in bytes.
+ * *_limit0/1_ptr,
+ * thresh0/1_ptr   each points at a single byte; set 0 and set 1 are merged
+ *                 into one vector with __msa_ilvr_d so that each 8-lane half
+ *                 uses its own set.
+ *                 NOTE(review): presumably set 0 covers the first 8 rows and
+ *                 set 1 the last 8 - confirm the lane order.
+ *
+ * The rows are transposed so the pixel columns p3..q3 become vectors,
+ * filtered, and the four modified columns p1, p0, q0, q1 are interleaved
+ * back and stored as 4 bytes per row starting at src - 2.
+ */
+static void loop_filter_vertical_4_dual_msa(uint8_t *src, int32_t pitch,
+ const uint8_t *b_limit0_ptr,
+ const uint8_t *limit0_ptr,
+ const uint8_t *thresh0_ptr,
+ const uint8_t *b_limit1_ptr,
+ const uint8_t *limit1_ptr,
+ const uint8_t *thresh1_ptr) {
+ v16u8 mask, hev, flat;
+ v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+ v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
+ v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
+ v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+
+ LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
+ LD_UB8(src - 4 + (8 * pitch), pitch, row8, row9, row10, row11, row12, row13,
+ row14, row15);
+ TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8,
+ row9, row10, row11, row12, row13, row14, row15, p3, p2,
+ p1, p0, q0, q1, q2, q3);
+
+ thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr);
+ thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr);
+ thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0);
+
+ b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr);
+ b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr);
+ b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0);
+
+ limit0 = (v16u8)__msa_fill_b(*limit0_ptr);
+ limit1 = (v16u8)__msa_fill_b(*limit1_ptr);
+ limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0);
+
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev,
+ mask, flat);
+ VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+ /* Interleave the four filtered columns back into per-row 4-byte groups. */
+ ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
+ ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3);
+ ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
+ ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5);
+
+ src -= 2;
+ ST4x8_UB(tmp2, tmp3, src, pitch);
+ src += (8 * pitch);
+ ST4x8_UB(tmp4, tmp5, src, pitch);
+}
+
+/* Applies the macroblock-edge filter (VP8_MBFILTER) across a horizontal
+ * edge, 16 pixels wide.
+ *
+ * src         first row below the edge; rows src - 4 * pitch through
+ *             src + 3 * pitch are read.
+ * pitch       distance between rows in bytes.
+ * b_limit_in, limit_in, thresh_in
+ *             scalar filter parameters, broadcast to all 16 lanes.
+ *
+ * Six rows are written back: p2, p1, p0, q0 starting at src - 3 * pitch,
+ * then q1 and q2.
+ */
+static void mbloop_filter_horizontal_edge_y_msa(uint8_t *src, int32_t pitch,
+ const uint8_t b_limit_in,
+ const uint8_t limit_in,
+ const uint8_t thresh_in) {
+ uint8_t *temp_src;
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+ v16u8 mask, hev, flat, thresh, limit, b_limit;
+
+ b_limit = (v16u8)__msa_fill_b(b_limit_in);
+ limit = (v16u8)__msa_fill_b(limit_in);
+ thresh = (v16u8)__msa_fill_b(thresh_in);
+ temp_src = src - (pitch << 2);
+ LD_UB8(temp_src, pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+ temp_src = src - 3 * pitch;
+ ST_UB4(p2, p1, p0, q0, temp_src, pitch);
+ temp_src += (4 * pitch);
+ ST_UB2(q1, q2, temp_src, pitch);
+}
+
+/* Applies the macroblock-edge filter (VP8_MBFILTER) across a horizontal edge
+ * in two planes at once.
+ *
+ * src_u, src_v  first row below the edge in each plane; eight rows are read
+ *               from each, starting 4 * pitch above.
+ * pitch         distance between rows in bytes (same for both planes).
+ * b_limit_in, limit_in, thresh_in
+ *               scalar filter parameters, broadcast to all lanes.
+ *
+ * The low 64 bits of each loaded U row and V row are combined into one
+ * 16-lane vector (U in doubleword 0, V in doubleword 1, matching the
+ * __msa_copy_u_d indices used for the stores). After filtering, 8 bytes per
+ * row of p2, p1, p0, q0, q1, q2 are written back to each plane starting at
+ * src - 3 * pitch.
+ */
+static void mbloop_filter_horizontal_edge_uv_msa(uint8_t *src_u, uint8_t *src_v,
+ int32_t pitch,
+ const uint8_t b_limit_in,
+ const uint8_t limit_in,
+ const uint8_t thresh_in) {
+ uint8_t *temp_src;
+ uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+ v16u8 mask, hev, flat, thresh, limit, b_limit;
+ v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
+ v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;
+
+ b_limit = (v16u8)__msa_fill_b(b_limit_in);
+ limit = (v16u8)__msa_fill_b(limit_in);
+ thresh = (v16u8)__msa_fill_b(thresh_in);
+
+ temp_src = src_u - (pitch << 2);
+ LD_UB8(temp_src, pitch, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u);
+ temp_src = src_v - (pitch << 2);
+ LD_UB8(temp_src, pitch, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v);
+
+ /* Merge the U and V rows so both planes are filtered in one pass. */
+ ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0);
+ ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3);
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+
+ /* Doubleword 0 of each result goes back to the U plane. */
+ p2_d = __msa_copy_u_d((v2i64)p2, 0);
+ p1_d = __msa_copy_u_d((v2i64)p1, 0);
+ p0_d = __msa_copy_u_d((v2i64)p0, 0);
+ q0_d = __msa_copy_u_d((v2i64)q0, 0);
+ q1_d = __msa_copy_u_d((v2i64)q1, 0);
+ q2_d = __msa_copy_u_d((v2i64)q2, 0);
+ src_u -= (pitch * 3);
+ SD4(p2_d, p1_d, p0_d, q0_d, src_u, pitch);
+ src_u += 4 * pitch;
+ SD(q1_d, src_u);
+ src_u += pitch;
+ SD(q2_d, src_u);
+
+ /* Doubleword 1 of each result goes back to the V plane. */
+ p2_d = __msa_copy_u_d((v2i64)p2, 1);
+ p1_d = __msa_copy_u_d((v2i64)p1, 1);
+ p0_d = __msa_copy_u_d((v2i64)p0, 1);
+ q0_d = __msa_copy_u_d((v2i64)q0, 1);
+ q1_d = __msa_copy_u_d((v2i64)q1, 1);
+ q2_d = __msa_copy_u_d((v2i64)q2, 1);
+ src_v -= (pitch * 3);
+ SD4(p2_d, p1_d, p0_d, q0_d, src_v, pitch);
+ src_v += 4 * pitch;
+ SD(q1_d, src_v);
+ src_v += pitch;
+ SD(q2_d, src_v);
+}
+
+/* Applies the macroblock-edge filter (VP8_MBFILTER) across a vertical edge,
+ * 16 rows tall.
+ *
+ * src         first column right of the edge in the first row; 16 rows are
+ *             loaded starting at src - 4.
+ * pitch       distance between rows in bytes.
+ * b_limit_in, limit_in, thresh_in
+ *             scalar filter parameters, broadcast to all 16 lanes.
+ *
+ * The rows are transposed so the pixel columns p3..q3 become vectors and
+ * filtered. The six modified columns p2, p1, p0, q0, q1, q2 are then
+ * interleaved back and written row by row with VP8_ST6x1_UB: 6 bytes per
+ * row, starting at src - 3.
+ */
+static void mbloop_filter_vertical_edge_y_msa(uint8_t *src, int32_t pitch,
+ const uint8_t b_limit_in,
+ const uint8_t limit_in,
+ const uint8_t thresh_in) {
+ uint8_t *temp_src;
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+ v16u8 mask, hev, flat, thresh, limit, b_limit;
+ v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
+ v16u8 row9, row10, row11, row12, row13, row14, row15;
+ v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+ b_limit = (v16u8)__msa_fill_b(b_limit_in);
+ limit = (v16u8)__msa_fill_b(limit_in);
+ thresh = (v16u8)__msa_fill_b(thresh_in);
+ temp_src = src - 4;
+ LD_UB8(temp_src, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
+ temp_src += (8 * pitch);
+ LD_UB8(temp_src, pitch, row8, row9, row10, row11, row12, row13, row14, row15);
+ TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8,
+ row9, row10, row11, row12, row13, row14, row15, p3, p2,
+ p1, p0, q0, q1, q2, q3);
+
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+ /* tmp3/tmp4/tmp6/tmp7 carry the p2,p1,p0,q0 bytes of each row as words;
+ * tmp2/tmp5 carry the q1,q2 bytes of each row as halfwords. */
+ ILVR_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
+ ILVRL_H2_SH(tmp1, tmp0, tmp3, tmp4);
+ ILVL_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
+ ILVRL_H2_SH(tmp1, tmp0, tmp6, tmp7);
+ ILVRL_B2_SH(q2, q1, tmp2, tmp5);
+
+ /* One 4-byte + 2-byte store per row, 16 rows. */
+ temp_src = src - 3;
+ VP8_ST6x1_UB(tmp3, 0, tmp2, 0, temp_src, 4);
+ temp_src += pitch;
+ VP8_ST6x1_UB(tmp3, 1, tmp2, 1, temp_src, 4);
+ temp_src += pitch;
+ VP8_ST6x1_UB(tmp3, 2, tmp2, 2, temp_src, 4);
+ temp_src += pitch;
+ VP8_ST6x1_UB(tmp3, 3, tmp2, 3, temp_src, 4);
+ temp_src += pitch;
+ VP8_ST6x1_UB(tmp4, 0, tmp2, 4, temp_src, 4);
+ temp_src += pitch;
+ VP8_ST6x1_UB(tmp4, 1, tmp2, 5, temp_src, 4);
+ temp_src += pitch;
+ VP8_ST6x1_UB(tmp4, 2, tmp2, 6, temp_src, 4);
+ temp_src += pitch;
+ VP8_ST6x1_UB(tmp4, 3, tmp2, 7, temp_src, 4);
+ temp_src += pitch;
+ VP8_ST6x1_UB(tmp6, 0, tmp5, 0, temp_src, 4);
+ temp_src += pitch;
+ VP8_ST6x1_UB(tmp6, 1, tmp5, 1, temp_src, 4);
+ temp_src += pitch;
+ VP8_ST6x1_UB(tmp6, 2, tmp5, 2, temp_src, 4);
+ temp_src += pitch;
+ VP8_ST6x1_UB(tmp6, 3, tmp5, 3, temp_src, 4);
+ temp_src += pitch;
+ VP8_ST6x1_UB(tmp7, 0, tmp5, 4, temp_src, 4);
+ temp_src += pitch;
+ VP8_ST6x1_UB(tmp7, 1, tmp5, 5, temp_src, 4);
+ temp_src += pitch;
+ VP8_ST6x1_UB(tmp7, 2, tmp5, 6, temp_src, 4);
+ temp_src += pitch;
+ VP8_ST6x1_UB(tmp7, 3, tmp5, 7, temp_src, 4);
+}
+
+static void mbloop_filter_vertical_edge_uv_msa(uint8_t *src_u, uint8_t *src_v, /* each points at the q0 column of the vertical edge */
+ int32_t pitch, /* row stride, shared by the U and V planes */
+ const uint8_t b_limit_in,
+ const uint8_t limit_in,
+ const uint8_t thresh_in) { /* the three scalars are broadcast below and fed to LPF_MASK_HEV */
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+ v16u8 mask, hev, flat, thresh, limit, b_limit;
+ v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
+ v16u8 row9, row10, row11, row12, row13, row14, row15;
+ v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+ b_limit = (v16u8)__msa_fill_b(b_limit_in);
+ limit = (v16u8)__msa_fill_b(limit_in);
+ thresh = (v16u8)__msa_fill_b(thresh_in);
+
+ LD_UB8(src_u - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7); /* 8 U rows, starting 4 pixels left of the edge */
+ LD_UB8(src_v - 4, pitch, row8, row9, row10, row11, row12, row13, row14,
+ row15); /* 8 V rows over the same column window */
+ TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8,
+ row9, row10, row11, row12, row13, row14, row15, p3, p2,
+ p1, p0, q0, q1, q2, q3); /* 16 rows in, one vector per pixel column (p3..q3) out */
+
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev); /* only p2..q2 are passed to the filter; p3/q3 feed the mask only */
+
+ ILVR_B2_SH(p1, p2, q0, p0, tmp0, tmp1); /* re-interleave columns back into row order: right-half results (tmp3/tmp4) are stored to U below */
+ ILVRL_H2_SH(tmp1, tmp0, tmp3, tmp4);
+ ILVL_B2_SH(p1, p2, q0, p0, tmp0, tmp1); /* left-half results (tmp6/tmp7) are stored to V below */
+ ILVRL_H2_SH(tmp1, tmp0, tmp6, tmp7);
+ ILVRL_B2_SH(q2, q1, tmp2, tmp5); /* q1/q2 pairs: tmp2 goes to U, tmp5 goes to V */
+
+ src_u -= 3; /* write-back starts at the p2 column; presumably 4 bytes from the first vector plus 2 bytes at offset 4 per row (VP8_ST6x1_UB is defined outside this view) */
+ VP8_ST6x1_UB(tmp3, 0, tmp2, 0, src_u, 4);
+ src_u += pitch;
+ VP8_ST6x1_UB(tmp3, 1, tmp2, 1, src_u, 4);
+ src_u += pitch;
+ VP8_ST6x1_UB(tmp3, 2, tmp2, 2, src_u, 4);
+ src_u += pitch;
+ VP8_ST6x1_UB(tmp3, 3, tmp2, 3, src_u, 4);
+ src_u += pitch;
+ VP8_ST6x1_UB(tmp4, 0, tmp2, 4, src_u, 4);
+ src_u += pitch;
+ VP8_ST6x1_UB(tmp4, 1, tmp2, 5, src_u, 4);
+ src_u += pitch;
+ VP8_ST6x1_UB(tmp4, 2, tmp2, 6, src_u, 4);
+ src_u += pitch;
+ VP8_ST6x1_UB(tmp4, 3, tmp2, 7, src_u, 4); /* 8th and last U row */
+
+ src_v -= 3; /* same write-back pattern for the 8 V rows */
+ VP8_ST6x1_UB(tmp6, 0, tmp5, 0, src_v, 4);
+ src_v += pitch;
+ VP8_ST6x1_UB(tmp6, 1, tmp5, 1, src_v, 4);
+ src_v += pitch;
+ VP8_ST6x1_UB(tmp6, 2, tmp5, 2, src_v, 4);
+ src_v += pitch;
+ VP8_ST6x1_UB(tmp6, 3, tmp5, 3, src_v, 4);
+ src_v += pitch;
+ VP8_ST6x1_UB(tmp7, 0, tmp5, 4, src_v, 4);
+ src_v += pitch;
+ VP8_ST6x1_UB(tmp7, 1, tmp5, 5, src_v, 4);
+ src_v += pitch;
+ VP8_ST6x1_UB(tmp7, 2, tmp5, 6, src_v, 4);
+ src_v += pitch;
+ VP8_ST6x1_UB(tmp7, 3, tmp5, 7, src_v, 4); /* 8th and last V row */
+}
+
+void vp8_loop_filter_simple_horizontal_edge_msa(uint8_t *src, int32_t pitch, /* src: first row below the horizontal edge (q0); pitch: row stride */
+ const uint8_t *b_limit_ptr) { /* only the first byte behind the pointer is read */
+ v16u8 p1, p0, q1, q0;
+ v16u8 mask, b_limit;
+
+ b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); /* broadcast the scalar limit to every byte lane */
+ LD_UB4(src - (pitch << 1), pitch, p1, p0, q0, q1); /* four consecutive rows: two above src, then src and the row after it */
+ VP8_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask);
+ VP8_SIMPLE_FILT(p1, p0, q0, q1, mask);
+ ST_UB2(p0, q0, (src - pitch), pitch); /* only p0 and q0 are written back */
+}
+
+void vp8_loop_filter_simple_vertical_edge_msa(uint8_t *src, int32_t pitch, /* src: q0 column of the vertical edge; pitch: row stride */
+ const uint8_t *b_limit_ptr) { /* only the first byte behind the pointer is read */
+ uint8_t *temp_src;
+ v16u8 p1, p0, q1, q0;
+ v16u8 mask, b_limit;
+ v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
+ v16u8 row9, row10, row11, row12, row13, row14, row15;
+ v8i16 tmp0, tmp1;
+
+ b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
+ temp_src = src - 2; /* window starts at the p1 column */
+ LD_UB8(temp_src, pitch, row0, row1, row2, row3, row4, row5, row6, row7); /* rows 0..7 */
+ temp_src += (8 * pitch);
+ LD_UB8(temp_src, pitch, row8, row9, row10, row11, row12, row13, row14, row15); /* rows 8..15 */
+ TRANSPOSE16x4_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8,
+ row9, row10, row11, row12, row13, row14, row15, p1, p0,
+ q0, q1); /* 16 rows in, four column vectors (p1, p0, q0, q1) out */
+ VP8_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask);
+ VP8_SIMPLE_FILT(p1, p0, q0, q1, mask);
+ ILVRL_B2_SH(q0, p0, tmp1, tmp0); /* pair p0 with q0 per row so each row can be stored as 2 bytes */
+
+ src -= 1; /* write-back starts at the p0 column */
+ ST2x4_UB(tmp1, 0, src, pitch); /* four stores x 4 rows cover all 16 rows */
+ src += 4 * pitch;
+ ST2x4_UB(tmp1, 4, src, pitch);
+ src += 4 * pitch;
+ ST2x4_UB(tmp0, 0, src, pitch);
+ src += 4 * pitch;
+ ST2x4_UB(tmp0, 4, src, pitch);
+ src += 4 * pitch; /* NOTE(review): src is a by-value parameter and is not read after this point */
+}
+
+static void loop_filter_horizontal_edge_uv_msa(uint8_t *src_u, uint8_t *src_v, /* each points at the first row below the horizontal edge (q0) */
+ int32_t pitch, /* row stride, shared by the U and V planes */
+ const uint8_t b_limit_in,
+ const uint8_t limit_in,
+ const uint8_t thresh_in) { /* the three scalars are broadcast below and fed to LPF_MASK_HEV */
+ uint64_t p1_d, p0_d, q0_d, q1_d;
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+ v16u8 mask, hev, flat, thresh, limit, b_limit;
+ v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
+ v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;
+
+ thresh = (v16u8)__msa_fill_b(thresh_in);
+ limit = (v16u8)__msa_fill_b(limit_in);
+ b_limit = (v16u8)__msa_fill_b(b_limit_in);
+
+ src_u = src_u - (pitch << 2); /* back up 4 rows to p3 */
+ LD_UB8(src_u, pitch, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u);
+ src_u += (5 * pitch); /* net +1 row from the incoming pointer: the q1 row, where write-back starts */
+ src_v = src_v - (pitch << 2);
+ LD_UB8(src_v, pitch, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v);
+ src_v += (5 * pitch); /* same repositioning for V */
+
+ /* After the interleave, the right (low) 8 elements of each vector are U
+ pixels and the left (high) 8 elements are V pixels. */
+ ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0);
+ ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3);
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev); /* only p1..q1 are passed to the filter */
+
+ p1_d = __msa_copy_u_d((v2i64)p1, 0); /* doubleword 0: the U half */
+ p0_d = __msa_copy_u_d((v2i64)p0, 0);
+ q0_d = __msa_copy_u_d((v2i64)q0, 0);
+ q1_d = __msa_copy_u_d((v2i64)q1, 0);
+ SD4(q1_d, q0_d, p0_d, p1_d, src_u, (-pitch)); /* negative stride: q1 first, then walking upwards to p1 */
+
+ p1_d = __msa_copy_u_d((v2i64)p1, 1); /* doubleword 1: the V half */
+ p0_d = __msa_copy_u_d((v2i64)p0, 1);
+ q0_d = __msa_copy_u_d((v2i64)q0, 1);
+ q1_d = __msa_copy_u_d((v2i64)q1, 1);
+ SD4(q1_d, q0_d, p0_d, p1_d, src_v, (-pitch));
+}
+
+static void loop_filter_vertical_edge_uv_msa(uint8_t *src_u, uint8_t *src_v, /* each points at the q0 column of the vertical edge */
+ int32_t pitch, /* row stride, shared by the U and V planes */
+ const uint8_t b_limit_in,
+ const uint8_t limit_in,
+ const uint8_t thresh_in) { /* the three scalars are broadcast below and fed to LPF_MASK_HEV */
+ uint8_t *temp_src_u, *temp_src_v;
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+ v16u8 mask, hev, flat, thresh, limit, b_limit;
+ v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
+ v16u8 row9, row10, row11, row12, row13, row14, row15;
+ v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+
+ thresh = (v16u8)__msa_fill_b(thresh_in);
+ limit = (v16u8)__msa_fill_b(limit_in);
+ b_limit = (v16u8)__msa_fill_b(b_limit_in);
+
+ LD_UB8(src_u - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7); /* 8 U rows, starting 4 pixels left of the edge */
+ LD_UB8(src_v - 4, pitch, row8, row9, row10, row11, row12, row13, row14,
+ row15); /* 8 V rows over the same column window */
+ TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8,
+ row9, row10, row11, row12, row13, row14, row15, p3, p2,
+ p1, p0, q0, q1, q2, q3); /* 16 rows in, one vector per pixel column (p3..q3) out */
+
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev); /* only p1..q1 are passed to the filter */
+ ILVR_B2_SW(p0, p1, q1, q0, tmp0, tmp1); /* right-half interleave: results (tmp2/tmp3) are stored to U below */
+ ILVRL_H2_SW(tmp1, tmp0, tmp2, tmp3);
+ tmp0 = (v4i32)__msa_ilvl_b((v16i8)p0, (v16i8)p1); /* left-half interleave: results (tmp4/tmp5) are stored to V below */
+ tmp1 = (v4i32)__msa_ilvl_b((v16i8)q1, (v16i8)q0);
+ ILVRL_H2_SW(tmp1, tmp0, tmp4, tmp5);
+
+ temp_src_u = src_u - 2; /* write-back starts at the p1 column, 4 bytes per row */
+ ST4x4_UB(tmp2, tmp2, 0, 1, 2, 3, temp_src_u, pitch); /* U rows 0..3 */
+ temp_src_u += 4 * pitch;
+ ST4x4_UB(tmp3, tmp3, 0, 1, 2, 3, temp_src_u, pitch); /* U rows 4..7 */
+
+ temp_src_v = src_v - 2;
+ ST4x4_UB(tmp4, tmp4, 0, 1, 2, 3, temp_src_v, pitch); /* V rows 0..3 */
+ temp_src_v += 4 * pitch;
+ ST4x4_UB(tmp5, tmp5, 0, 1, 2, 3, temp_src_v, pitch); /* V rows 4..7 */
+}
+
+void vp8_loop_filter_mbh_msa(uint8_t *src_y, uint8_t *src_u, uint8_t *src_v, /* runs the mbloop horizontal-edge filter on luma, then on chroma when present */
+ int32_t pitch_y, int32_t pitch_u_v, /* separate strides for luma and for the two chroma planes */
+ loop_filter_info *lpf_info_ptr) { /* mblim, lim and hev_thr are each dereferenced for their first byte */
+ mbloop_filter_horizontal_edge_y_msa(src_y, pitch_y, *lpf_info_ptr->mblim,
+ *lpf_info_ptr->lim,
+ *lpf_info_ptr->hev_thr);
+ if (src_u) { /* chroma is skipped when src_u is NULL; src_v is not checked separately */
+ mbloop_filter_horizontal_edge_uv_msa(
+ src_u, src_v, pitch_u_v, *lpf_info_ptr->mblim, *lpf_info_ptr->lim,
+ *lpf_info_ptr->hev_thr);
+ }
+}
+
+void vp8_loop_filter_mbv_msa(uint8_t *src_y, uint8_t *src_u, uint8_t *src_v, /* runs the mbloop vertical-edge filter on luma, then on chroma when present */
+ int32_t pitch_y, int32_t pitch_u_v, /* separate strides for luma and for the two chroma planes */
+ loop_filter_info *lpf_info_ptr) { /* mblim, lim and hev_thr are each dereferenced for their first byte */
+ mbloop_filter_vertical_edge_y_msa(src_y, pitch_y, *lpf_info_ptr->mblim,
+ *lpf_info_ptr->lim, *lpf_info_ptr->hev_thr);
+ if (src_u) { /* chroma is skipped when src_u is NULL; src_v is not checked separately */
+ mbloop_filter_vertical_edge_uv_msa(src_u, src_v, pitch_u_v,
+ *lpf_info_ptr->mblim, *lpf_info_ptr->lim,
+ *lpf_info_ptr->hev_thr);
+ }
+}
+
+void vp8_loop_filter_bh_msa(uint8_t *src_y, uint8_t *src_u, uint8_t *src_v, /* filters the horizontal edges at luma rows 4, 8 and 12, plus chroma row 4 when chroma is present */
+ int32_t pitch_y, int32_t pitch_u_v,
+ loop_filter_info *lpf_info_ptr) { /* uses blim (not mblim) together with lim and hev_thr */
+ loop_filter_horizontal_4_dual_msa(src_y + 4 * pitch_y, pitch_y, /* the same three threshold pointers are passed twice; presumably one set per half of the dual filter (helper defined outside this view) */
+ lpf_info_ptr->blim, lpf_info_ptr->lim,
+ lpf_info_ptr->hev_thr, lpf_info_ptr->blim,
+ lpf_info_ptr->lim, lpf_info_ptr->hev_thr);
+ loop_filter_horizontal_4_dual_msa(src_y + 8 * pitch_y, pitch_y,
+ lpf_info_ptr->blim, lpf_info_ptr->lim,
+ lpf_info_ptr->hev_thr, lpf_info_ptr->blim,
+ lpf_info_ptr->lim, lpf_info_ptr->hev_thr);
+ loop_filter_horizontal_4_dual_msa(src_y + 12 * pitch_y, pitch_y,
+ lpf_info_ptr->blim, lpf_info_ptr->lim,
+ lpf_info_ptr->hev_thr, lpf_info_ptr->blim,
+ lpf_info_ptr->lim, lpf_info_ptr->hev_thr);
+ if (src_u) { /* chroma is skipped when src_u is NULL; src_v is not checked separately */
+ loop_filter_horizontal_edge_uv_msa(
+ src_u + (4 * pitch_u_v), src_v + (4 * pitch_u_v), pitch_u_v,
+ *lpf_info_ptr->blim, *lpf_info_ptr->lim, *lpf_info_ptr->hev_thr); /* here the thresholds are passed by value (first byte) */
+ }
+}
+
+void vp8_loop_filter_bv_msa(uint8_t *src_y, uint8_t *src_u, uint8_t *src_v, /* filters the vertical edges at luma columns 4, 8 and 12, plus chroma column 4 when chroma is present */
+ int32_t pitch_y, int32_t pitch_u_v,
+ loop_filter_info *lpf_info_ptr) { /* uses blim (not mblim) together with lim and hev_thr */
+ loop_filter_vertical_4_dual_msa(src_y + 4, pitch_y, lpf_info_ptr->blim, /* the same three threshold pointers are passed twice; presumably one set per half of the dual filter (helper defined outside this view) */
+ lpf_info_ptr->lim, lpf_info_ptr->hev_thr,
+ lpf_info_ptr->blim, lpf_info_ptr->lim,
+ lpf_info_ptr->hev_thr);
+ loop_filter_vertical_4_dual_msa(src_y + 8, pitch_y, lpf_info_ptr->blim,
+ lpf_info_ptr->lim, lpf_info_ptr->hev_thr,
+ lpf_info_ptr->blim, lpf_info_ptr->lim,
+ lpf_info_ptr->hev_thr);
+ loop_filter_vertical_4_dual_msa(src_y + 12, pitch_y, lpf_info_ptr->blim,
+ lpf_info_ptr->lim, lpf_info_ptr->hev_thr,
+ lpf_info_ptr->blim, lpf_info_ptr->lim,
+ lpf_info_ptr->hev_thr);
+ if (src_u) { /* chroma is skipped when src_u is NULL; src_v is not checked separately */
+ loop_filter_vertical_edge_uv_msa(src_u + 4, src_v + 4, pitch_u_v,
+ *lpf_info_ptr->blim, *lpf_info_ptr->lim,
+ *lpf_info_ptr->hev_thr); /* here the thresholds are passed by value (first byte) */
+ }
+}
+
+void vp8_loop_filter_bhs_msa(uint8_t *src_y, int32_t pitch_y, /* simple-filter variant: luma only, horizontal edges at rows 4, 8 and 12 */
+ const uint8_t *b_limit_ptr) { /* forwarded unchanged to each edge call */
+ vp8_loop_filter_simple_horizontal_edge_msa(src_y + (4 * pitch_y), pitch_y,
+ b_limit_ptr);
+ vp8_loop_filter_simple_horizontal_edge_msa(src_y + (8 * pitch_y), pitch_y,
+ b_limit_ptr);
+ vp8_loop_filter_simple_horizontal_edge_msa(src_y + (12 * pitch_y), pitch_y,
+ b_limit_ptr);
+}
+
+void vp8_loop_filter_bvs_msa(uint8_t *src_y, int32_t pitch_y, /* simple-filter variant: luma only, vertical edges at columns 4, 8 and 12 */
+ const uint8_t *b_limit_ptr) { /* forwarded unchanged to each edge call */
+ vp8_loop_filter_simple_vertical_edge_msa(src_y + 4, pitch_y, b_limit_ptr);
+ vp8_loop_filter_simple_vertical_edge_msa(src_y + 8, pitch_y, b_limit_ptr);
+ vp8_loop_filter_simple_vertical_edge_msa(src_y + 12, pitch_y, b_limit_ptr);
+}
diff --git a/media/libvpx/libvpx/vp8/common/mips/msa/mfqe_msa.c b/media/libvpx/libvpx/vp8/common/mips/msa/mfqe_msa.c
new file mode 100644
index 0000000000..9aac95b2fa
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/mips/msa/mfqe_msa.c
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vp8/common/postproc.h"
+#include "vp8/common/mips/msa/vp8_macros_msa.h"
+
+static void filter_by_weight8x8_msa(uint8_t *src_ptr, int32_t src_stride, /* blends an 8x8 block of src into dst, in place in dst */
+ uint8_t *dst_ptr, int32_t dst_stride, /* dst is both read and overwritten */
+ int32_t src_weight) { /* weight given to src, out of 1 << MFQE_PRECISION */
+ int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight; /* the two weights always sum to 1 << MFQE_PRECISION */
+ int32_t row;
+ uint64_t src0_d, src1_d, dst0_d, dst1_d;
+ v16i8 src0 = { 0 };
+ v16i8 src1 = { 0 };
+ v16i8 dst0 = { 0 };
+ v16i8 dst1 = { 0 };
+ v8i16 src_wt, dst_wt, res_h_r, res_h_l, src_r, src_l, dst_r, dst_l;
+
+ src_wt = __msa_fill_h(src_weight); /* weights broadcast to every halfword lane */
+ dst_wt = __msa_fill_h(dst_weight);
+
+ for (row = 2; row--;) { /* 2 iterations x 4 rows = 8 rows */
+ LD2(src_ptr, src_stride, src0_d, src1_d); /* two 8-byte rows, packed into one vector below */
+ src_ptr += (2 * src_stride);
+ LD2(dst_ptr, dst_stride, dst0_d, dst1_d);
+ INSERT_D2_SB(src0_d, src1_d, src0);
+ INSERT_D2_SB(dst0_d, dst1_d, dst0);
+
+ LD2(src_ptr, src_stride, src0_d, src1_d); /* next two rows */
+ src_ptr += (2 * src_stride);
+ LD2((dst_ptr + 2 * dst_stride), dst_stride, dst0_d, dst1_d); /* dst_ptr itself is advanced only after each store */
+ INSERT_D2_SB(src0_d, src1_d, src1);
+ INSERT_D2_SB(dst0_d, dst1_d, dst1);
+
+ UNPCK_UB_SH(src0, src_r, src_l); /* widen bytes to halfwords before the multiply */
+ UNPCK_UB_SH(dst0, dst_r, dst_l);
+ res_h_r = (src_r * src_wt); /* src * src_weight + dst * dst_weight */
+ res_h_r += (dst_r * dst_wt);
+ res_h_l = (src_l * src_wt);
+ res_h_l += (dst_l * dst_wt);
+ SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION); /* shift back down by MFQE_PRECISION (presumably with rounding, per the SRARI name) */
+ dst0 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r); /* narrow back to bytes */
+ ST8x2_UB(dst0, dst_ptr, dst_stride);
+ dst_ptr += (2 * dst_stride);
+
+ UNPCK_UB_SH(src1, src_r, src_l); /* same blend for the second pair of rows */
+ UNPCK_UB_SH(dst1, dst_r, dst_l);
+ res_h_r = (src_r * src_wt);
+ res_h_r += (dst_r * dst_wt);
+ res_h_l = (src_l * src_wt);
+ res_h_l += (dst_l * dst_wt);
+ SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+ dst1 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r);
+ ST8x2_UB(dst1, dst_ptr, dst_stride);
+ dst_ptr += (2 * dst_stride);
+ }
+}
+
+static void filter_by_weight16x16_msa(uint8_t *src_ptr, int32_t src_stride, /* blends a 16x16 block of src into dst, in place in dst */
+ uint8_t *dst_ptr, int32_t dst_stride, /* dst is both read and overwritten */
+ int32_t src_weight) { /* weight given to src, out of 1 << MFQE_PRECISION */
+ int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight; /* the two weights always sum to 1 << MFQE_PRECISION */
+ int32_t row;
+ v16i8 src0, src1, src2, src3;
+ v16i8 dst0, dst1, dst2, dst3;
+ v8i16 src_wt, dst_wt;
+ v8i16 res_h_r, res_h_l;
+ v8i16 src_r, src_l, dst_r, dst_l;
+
+ src_wt = __msa_fill_h(src_weight); /* weights broadcast to every halfword lane */
+ dst_wt = __msa_fill_h(dst_weight);
+
+ for (row = 4; row--;) { /* 4 iterations x 4 rows = 16 rows */
+ LD_SB4(src_ptr, src_stride, src0, src1, src2, src3);
+ src_ptr += (4 * src_stride);
+ LD_SB4(dst_ptr, dst_stride, dst0, dst1, dst2, dst3); /* all four dst rows are read before any of them is overwritten */
+
+ UNPCK_UB_SH(src0, src_r, src_l); /* row 0: widen, weight, shift back, narrow, store */
+ UNPCK_UB_SH(dst0, dst_r, dst_l);
+ res_h_r = (src_r * src_wt);
+ res_h_r += (dst_r * dst_wt);
+ res_h_l = (src_l * src_wt);
+ res_h_l += (dst_l * dst_wt);
+ SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+ PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
+ dst_ptr += dst_stride;
+
+ UNPCK_UB_SH(src1, src_r, src_l); /* row 1 */
+ UNPCK_UB_SH(dst1, dst_r, dst_l);
+ res_h_r = (src_r * src_wt);
+ res_h_r += (dst_r * dst_wt);
+ res_h_l = (src_l * src_wt);
+ res_h_l += (dst_l * dst_wt);
+ SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+ PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
+ dst_ptr += dst_stride;
+
+ UNPCK_UB_SH(src2, src_r, src_l); /* row 2 */
+ UNPCK_UB_SH(dst2, dst_r, dst_l);
+ res_h_r = (src_r * src_wt);
+ res_h_r += (dst_r * dst_wt);
+ res_h_l = (src_l * src_wt);
+ res_h_l += (dst_l * dst_wt);
+ SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+ PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
+ dst_ptr += dst_stride;
+
+ UNPCK_UB_SH(src3, src_r, src_l); /* row 3 */
+ UNPCK_UB_SH(dst3, dst_r, dst_l);
+ res_h_r = (src_r * src_wt);
+ res_h_r += (dst_r * dst_wt);
+ res_h_l = (src_l * src_wt);
+ res_h_l += (dst_l * dst_wt);
+ SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+ PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
+ dst_ptr += dst_stride;
+ }
+}
+
+void vp8_filter_by_weight16x16_msa(uint8_t *src_ptr, int32_t src_stride, /* exported entry point for the 16x16 weighted blend */
+ uint8_t *dst_ptr, int32_t dst_stride,
+ int32_t src_weight) {
+ filter_by_weight16x16_msa(src_ptr, src_stride, dst_ptr, dst_stride, /* all arguments are forwarded unchanged to the static helper */
+ src_weight);
+}
+
+void vp8_filter_by_weight8x8_msa(uint8_t *src_ptr, int32_t src_stride, /* exported entry point for the 8x8 weighted blend */
+ uint8_t *dst_ptr, int32_t dst_stride,
+ int32_t src_weight) {
+ filter_by_weight8x8_msa(src_ptr, src_stride, dst_ptr, dst_stride, src_weight); /* all arguments are forwarded unchanged to the static helper */
+}
diff --git a/media/libvpx/libvpx/vp8/common/mips/msa/sixtap_filter_msa.c b/media/libvpx/libvpx/vp8/common/mips/msa/sixtap_filter_msa.c
new file mode 100644
index 0000000000..3a1bb7cd57
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/mips/msa/sixtap_filter_msa.c
@@ -0,0 +1,1738 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vpx_ports/mem.h"
+#include "vp8/common/filter.h"
+#include "vp8/common/mips/msa/vp8_macros_msa.h"
+
+DECLARE_ALIGNED(16, static const int8_t, vp8_subpel_filters_msa[7][8]) = { /* 7 tap sets of 6 taps each, zero-padded to 8 bytes; every row sums to 128 */
+ { 0, -6, 123, 12, -1, 0, 0, 0 }, /* outer taps are 0 in this row, so only 4 taps are effective */
+ { 2, -11, 108, 36, -8, 1, 0, 0 }, /* New 1/4 pel 6 tap filter */
+ { 0, -9, 93, 50, -6, 0, 0, 0 },
+ { 3, -16, 77, 77, -16, 3, 0, 0 }, /* New 1/2 pel 6 tap filter */
+ { 0, -6, 50, 93, -9, 0, 0, 0 },
+ { 1, -8, 36, 108, -11, 2, 0, 0 }, /* New 1/4 pel 6 tap filter */
+ { 0, -1, 12, 123, -6, 0, 0, 0 }, /* mirror image of the first row */
+};
+
+static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = { /* three 16-byte shuffle masks; the visible code loads the ones at offsets 0 and 16 */
+ /* 8 width cases */
+ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, /* adjacent-byte pairs (0,1), (1,2) ... (7,8) */
+ /* 4 width cases */
+ 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20, /* indices >= 16 presumably select bytes of the second shuffle source, i.e. a second row */
+ /* 4 width cases */
+ 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 /* same pairing, starting 8 bytes further in */
+};
+
+#define HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_h0, filt_h1, \
+ filt_h2) /* statement expression: evaluates to the filtered, rounded and saturated v8i16 */ \
+ ({ \
+ v16i8 _6tap_vec0_m, _6tap_vec1_m, _6tap_vec2_m; \
+ v8i16 _6tap_out_m; \
+ \
+ VSHF_B3_SB(src0, src1, src0, src1, src0, src1, mask0, mask1, mask2, /* one shuffle of (src0, src1) per mask */ \
+ _6tap_vec0_m, _6tap_vec1_m, _6tap_vec2_m); \
+ _6tap_out_m = DPADD_SH3_SH(_6tap_vec0_m, _6tap_vec1_m, _6tap_vec2_m, /* combine the three shuffled vectors with the three filter vectors */ \
+ filt_h0, filt_h1, filt_h2); \
+ \
+ _6tap_out_m = __msa_srari_h(_6tap_out_m, VP8_FILTER_SHIFT); /* rounding right shift */ \
+ _6tap_out_m = __msa_sat_s_h(_6tap_out_m, 7); /* saturate to the signed 8-bit range */ \
+ \
+ _6tap_out_m; \
+ })
+
+#define HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \
+ mask2, filt0, filt1, filt2, out0, out1) /* writes unrounded sums to out0/out1; the callers in this file round and saturate afterwards */ \
+ { \
+ v16i8 _6tap_4wid_vec0_m, _6tap_4wid_vec1_m, _6tap_4wid_vec2_m, \
+ _6tap_4wid_vec3_m, _6tap_4wid_vec4_m, _6tap_4wid_vec5_m; \
+ \
+ VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, _6tap_4wid_vec0_m, /* rows are shuffled in pairs: (src0, src1) and (src2, src3) */ \
+ _6tap_4wid_vec1_m); \
+ DOTP_SB2_SH(_6tap_4wid_vec0_m, _6tap_4wid_vec1_m, filt0, filt0, out0, /* first term initialises out0/out1 */ \
+ out1); \
+ VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, _6tap_4wid_vec2_m, \
+ _6tap_4wid_vec3_m); \
+ DPADD_SB2_SH(_6tap_4wid_vec2_m, _6tap_4wid_vec3_m, filt1, filt1, out0, /* second term accumulates */ \
+ out1); \
+ VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, _6tap_4wid_vec4_m, \
+ _6tap_4wid_vec5_m); \
+ DPADD_SB2_SH(_6tap_4wid_vec4_m, _6tap_4wid_vec5_m, filt2, filt2, out0, /* third term accumulates */ \
+ out1); \
+ }
+
+#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \
+ mask2, filt0, filt1, filt2, out0, out1, \
+ out2, out3) /* writes unrounded sums to out0..out3; the callers in this file round and saturate afterwards */ \
+ { \
+ v16i8 _6tap_8wid_vec0_m, _6tap_8wid_vec1_m, _6tap_8wid_vec2_m, \
+ _6tap_8wid_vec3_m, _6tap_8wid_vec4_m, _6tap_8wid_vec5_m, \
+ _6tap_8wid_vec6_m, _6tap_8wid_vec7_m; \
+ \
+ VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, _6tap_8wid_vec0_m, /* each source vector is shuffled against itself: one input per output */ \
+ _6tap_8wid_vec1_m); \
+ VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, _6tap_8wid_vec2_m, \
+ _6tap_8wid_vec3_m); \
+ DOTP_SB4_SH(_6tap_8wid_vec0_m, _6tap_8wid_vec1_m, _6tap_8wid_vec2_m, /* first term initialises out0..out3 */ \
+ _6tap_8wid_vec3_m, filt0, filt0, filt0, filt0, out0, out1, \
+ out2, out3); \
+ VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, _6tap_8wid_vec0_m, /* vec0..vec3 are reused for the mask1 shuffles */ \
+ _6tap_8wid_vec1_m); \
+ VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, _6tap_8wid_vec2_m, \
+ _6tap_8wid_vec3_m); \
+ VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, _6tap_8wid_vec4_m, \
+ _6tap_8wid_vec5_m); \
+ VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, _6tap_8wid_vec6_m, \
+ _6tap_8wid_vec7_m); \
+ DPADD_SB4_SH(_6tap_8wid_vec0_m, _6tap_8wid_vec1_m, _6tap_8wid_vec2_m, /* second term accumulates */ \
+ _6tap_8wid_vec3_m, filt1, filt1, filt1, filt1, out0, out1, \
+ out2, out3); \
+ DPADD_SB4_SH(_6tap_8wid_vec4_m, _6tap_8wid_vec5_m, _6tap_8wid_vec6_m, /* third term accumulates */ \
+ _6tap_8wid_vec7_m, filt2, filt2, filt2, filt2, out0, out1, \
+ out2, out3); \
+ }
+
+#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1) /* statement expression: evaluates to dotp(vec0, filt0) accumulated with dotp(vec1, filt1), unrounded */ \
+ ({ \
+ v8i16 _4tap_dpadd_tmp0; \
+ \
+ _4tap_dpadd_tmp0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0); /* signed byte dot product into halfwords */ \
+ _4tap_dpadd_tmp0 = \
+ __msa_dpadd_s_h(_4tap_dpadd_tmp0, (v16i8)vec1, (v16i8)filt1); /* add the second dot product to the first */ \
+ \
+ _4tap_dpadd_tmp0; \
+ })
+
+#define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1) /* statement expression: evaluates to the filtered, rounded and saturated v8i16 */ \
+ ({ \
+ v16i8 _4tap_vec0_m, _4tap_vec1_m; \
+ v8i16 _4tap_out_m; \
+ \
+ VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, _4tap_vec0_m, /* one shuffle of (src0, src1) per mask */ \
+ _4tap_vec1_m); \
+ _4tap_out_m = \
+ FILT_4TAP_DPADD_S_H(_4tap_vec0_m, _4tap_vec1_m, filt_h0, filt_h1); \
+ \
+ _4tap_out_m = __msa_srari_h(_4tap_out_m, VP8_FILTER_SHIFT); /* rounding right shift */ \
+ _4tap_out_m = __msa_sat_s_h(_4tap_out_m, 7); /* saturate to the signed 8-bit range */ \
+ \
+ _4tap_out_m; \
+ })
+
+#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \
+ filt0, filt1, out0, out1) /* writes sums to out0/out1; no rounding shift or saturation is applied inside this macro */ \
+ { \
+ v16i8 _4tap_4wid_vec0_m, _4tap_4wid_vec1_m, _4tap_4wid_vec2_m, \
+ _4tap_4wid_vec3_m; \
+ \
+ VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, _4tap_4wid_vec0_m, /* rows are shuffled in pairs: (src0, src1) and (src2, src3) */ \
+ _4tap_4wid_vec1_m); \
+ DOTP_SB2_SH(_4tap_4wid_vec0_m, _4tap_4wid_vec1_m, filt0, filt0, out0, /* first term initialises out0/out1 */ \
+ out1); \
+ VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, _4tap_4wid_vec2_m, \
+ _4tap_4wid_vec3_m); \
+ DPADD_SB2_SH(_4tap_4wid_vec2_m, _4tap_4wid_vec3_m, filt1, filt1, out0, /* second term accumulates */ \
+ out1); \
+ }
+
+#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \
+ filt0, filt1, out0, out1, out2, out3) /* writes sums to out0..out3; no rounding shift or saturation is applied inside this macro */ \
+ { \
+ v16i8 _4tap_8wid_vec0_m, _4tap_8wid_vec1_m, _4tap_8wid_vec2_m, \
+ _4tap_8wid_vec3_m; \
+ \
+ VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, _4tap_8wid_vec0_m, /* each source vector is shuffled against itself: one input per output */ \
+ _4tap_8wid_vec1_m); \
+ VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, _4tap_8wid_vec2_m, \
+ _4tap_8wid_vec3_m); \
+ DOTP_SB4_SH(_4tap_8wid_vec0_m, _4tap_8wid_vec1_m, _4tap_8wid_vec2_m, /* first term initialises out0..out3 */ \
+ _4tap_8wid_vec3_m, filt0, filt0, filt0, filt0, out0, out1, \
+ out2, out3); \
+ VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, _4tap_8wid_vec0_m, /* the same temporaries are reused for the mask1 shuffles */ \
+ _4tap_8wid_vec1_m); \
+ VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, _4tap_8wid_vec2_m, \
+ _4tap_8wid_vec3_m); \
+ DPADD_SB4_SH(_4tap_8wid_vec0_m, _4tap_8wid_vec1_m, _4tap_8wid_vec2_m, /* second term accumulates */ \
+ _4tap_8wid_vec3_m, filt1, filt1, filt1, filt1, out0, out1, \
+ out2, out3); \
+ }
+
+static void common_hz_6t_4x4_msa(uint8_t *RESTRICT src, int32_t src_stride, /* horizontal 6-tap filter producing a 4-wide, 4-row block */
+ uint8_t *RESTRICT dst, int32_t dst_stride,
+ const int8_t *filter) { /* tap array; loaded as halfwords, so taps are consumed in pairs */
+ v16i8 src0, src1, src2, src3, filt0, filt1, filt2;
+ v16u8 mask0, mask1, mask2, out;
+ v8i16 filt, out0, out1;
+
+ mask0 = LD_UB(&vp8_mc_filt_mask_arr[16]); /* second mask row (the first "4 width cases" entry) */
+ src -= 2; /* the filter window starts 2 pixels left of the output position */
+
+ filt = LD_SH(filter);
+ SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2); /* halfwords 0, 1, 2 = tap pairs (0,1), (2,3), (4,5) */
+
+ mask1 = mask0 + 2; /* same shuffle pattern moved 2 and 4 pixels right for the later tap pairs */
+ mask2 = mask0 + 4;
+
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ XORI_B4_128_SB(src0, src1, src2, src3); /* presumably flips bit 7 so unsigned pixels fit the signed dot products */
+ HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0,
+ filt1, filt2, out0, out1);
+ SRARI_H2_SH(out0, out1, VP8_FILTER_SHIFT); /* rounding shift, then saturate, as in HORIZ_6TAP_FILT */
+ SAT_SH2_SH(out0, out1, 7);
+ out = PCKEV_XORI128_UB(out0, out1); /* narrow to bytes and undo the earlier xor with 128 (per the macro name) */
+ ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); /* 4 bytes to each of 4 rows */
+}
+
+static void common_hz_6t_4x8_msa(uint8_t *RESTRICT src, int32_t src_stride, /* horizontal 6-tap filter producing a 4-wide, 8-row block */
+ uint8_t *RESTRICT dst, int32_t dst_stride,
+ const int8_t *filter) { /* tap array; loaded as halfwords, so taps are consumed in pairs */
+ v16i8 src0, src1, src2, src3, filt0, filt1, filt2;
+ v16u8 mask0, mask1, mask2, out;
+ v8i16 filt, out0, out1, out2, out3;
+
+ mask0 = LD_UB(&vp8_mc_filt_mask_arr[16]); /* second mask row (the first "4 width cases" entry) */
+ src -= 2; /* the filter window starts 2 pixels left of the output position */
+
+ filt = LD_SH(filter);
+ SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2); /* halfwords 0, 1, 2 = tap pairs (0,1), (2,3), (4,5) */
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+
+ LD_SB4(src, src_stride, src0, src1, src2, src3); /* rows 0..3 */
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ src += (4 * src_stride);
+ HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0,
+ filt1, filt2, out0, out1);
+ LD_SB4(src, src_stride, src0, src1, src2, src3); /* rows 4..7 reuse the same source registers */
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0,
+ filt1, filt2, out2, out3);
+ SRARI_H4_SH(out0, out1, out2, out3, VP8_FILTER_SHIFT); /* round and saturate all 8 rows together */
+ SAT_SH4_SH(out0, out1, out2, out3, 7);
+ out = PCKEV_XORI128_UB(out0, out1);
+ ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); /* output rows 0..3 */
+ dst += (4 * dst_stride);
+ out = PCKEV_XORI128_UB(out2, out3);
+ ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); /* output rows 4..7 */
+}
+
+static void common_hz_6t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride, /* dispatches the 4-wide horizontal 6-tap filter by block height */
+ uint8_t *RESTRICT dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height) { /* only heights 4 and 8 are handled */
+ if (4 == height) {
+ common_hz_6t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+ } else if (8 == height) {
+ common_hz_6t_4x8_msa(src, src_stride, dst, dst_stride, filter);
+ } /* any other height falls through and leaves dst untouched */
+}
+
+static void common_hz_6t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride, /* horizontal 6-tap filter producing an 8-wide block */
+ uint8_t *RESTRICT dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height) { /* rows are processed 4 at a time; the first 4 are always written */
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, filt0, filt1, filt2;
+ v16u8 mask0, mask1, mask2, tmp0, tmp1;
+ v8i16 filt, out0, out1, out2, out3;
+
+ mask0 = LD_UB(&vp8_mc_filt_mask_arr[0]); /* first mask row ("8 width cases") */
+ src -= 2; /* the filter window starts 2 pixels left of the output position */
+
+ filt = LD_SH(filter);
+ SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2); /* halfwords 0, 1, 2 = tap pairs (0,1), (2,3), (4,5) */
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+
+ LD_SB4(src, src_stride, src0, src1, src2, src3); /* first group of 4 rows is handled ahead of the loop */
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ src += (4 * src_stride);
+ HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0,
+ filt1, filt2, out0, out1, out2, out3);
+ SRARI_H4_SH(out0, out1, out2, out3, VP8_FILTER_SHIFT);
+ SAT_SH4_SH(out0, out1, out2, out3, 7);
+ tmp0 = PCKEV_XORI128_UB(out0, out1);
+ tmp1 = PCKEV_XORI128_UB(out2, out3);
+ ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ for (loop_cnt = (height >> 2) - 1; loop_cnt--;) { /* remaining groups of 4 rows; NOTE(review): for height < 4 the uint32_t counter would wrap, so callers must pass height >= 4 */
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ src += (4 * src_stride);
+ HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ filt0, filt1, filt2, out0, out1, out2, out3);
+ SRARI_H4_SH(out0, out1, out2, out3, VP8_FILTER_SHIFT);
+ SAT_SH4_SH(out0, out1, out2, out3, 7);
+ tmp0 = PCKEV_XORI128_UB(out0, out1);
+ tmp1 = PCKEV_XORI128_UB(out2, out3);
+ ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+}
+
+static void common_hz_6t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride, /* horizontal 6-tap filter producing a 16-wide block */
+ uint8_t *RESTRICT dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height) { /* height >> 2 groups of 4 rows are written; a remainder of height % 4 rows is not */
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, filt0, filt1, filt2;
+ v16u8 mask0, mask1, mask2, out;
+ v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
+
+ mask0 = LD_UB(&vp8_mc_filt_mask_arr[0]); /* first mask row ("8 width cases") */
+ src -= 2; /* the filter window starts 2 pixels left of the output position */
+
+ filt = LD_SH(filter);
+ SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2); /* halfwords 0, 1, 2 = tap pairs (0,1), (2,3), (4,5) */
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src0, src2, src4, src6); /* even registers: window for the left 8 output pixels of each row */
+ LD_SB4(src + 8, src_stride, src1, src3, src5, src7); /* odd registers: same rows, 8 pixels further right */
+ XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+ src += (4 * src_stride);
+
+ HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ filt0, filt1, filt2, out0, out1, out2, out3); /* rows 0 and 1, left/right halves */
+ HORIZ_6TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
+ filt0, filt1, filt2, out4, out5, out6, out7); /* rows 2 and 3, left/right halves */
+ SRARI_H4_SH(out0, out1, out2, out3, VP8_FILTER_SHIFT);
+ SRARI_H4_SH(out4, out5, out6, out7, VP8_FILTER_SHIFT);
+ SAT_SH4_SH(out0, out1, out2, out3, 7);
+ SAT_SH4_SH(out4, out5, out6, out7, 7);
+ out = PCKEV_XORI128_UB(out0, out1); /* each left/right pair packs into one full 16-byte output row */
+ ST_UB(out, dst);
+ dst += dst_stride;
+ out = PCKEV_XORI128_UB(out2, out3);
+ ST_UB(out, dst);
+ dst += dst_stride;
+ out = PCKEV_XORI128_UB(out4, out5);
+ ST_UB(out, dst);
+ dst += dst_stride;
+ out = PCKEV_XORI128_UB(out6, out7);
+ ST_UB(out, dst);
+ dst += dst_stride;
+ }
+}
+
+static void common_vt_6t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride, /* vertical 6-tap filter producing a 4-wide block */
+ uint8_t *RESTRICT dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height) { /* height >> 2 groups of 4 rows are written; a remainder of height % 4 rows is not */
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+ v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+ v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
+ v16u8 out;
+ v8i16 filt, out10, out32;
+
+ src -= (2 * src_stride); /* the filter window starts 2 rows above the output row */
+
+ filt = LD_SH(filter);
+ SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2); /* halfwords 0, 1, 2 = tap pairs (0,1), (2,3), (4,5) */
+
+ LD_SB5(src, src_stride, src0, src1, src2, src3, src4); /* prime the window with the first 5 rows */
+ src += (5 * src_stride);
+
+ ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, /* interleave each row with the row below it */
+ src32_r, src43_r);
+ ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); /* pack two interleaved row pairs per vector */
+ XORI_B2_128_SB(src2110, src4332);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) { /* each iteration consumes 4 new rows and emits 4 output rows */
+ LD_SB4(src, src_stride, src5, src6, src7, src8);
+ src += (4 * src_stride);
+
+ ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
+ src76_r, src87_r);
+ ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
+ XORI_B2_128_SB(src6554, src8776);
+ out10 = DPADD_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2); /* output rows 0 and 1 */
+ out32 = DPADD_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2); /* output rows 2 and 3 */
+ SRARI_H2_SH(out10, out32, VP8_FILTER_SHIFT);
+ SAT_SH2_SH(out10, out32, 7);
+ out = PCKEV_XORI128_UB(out10, out32);
+ ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ src2110 = src6554; /* slide the window down by 4 rows for the next iteration */
+ src4332 = src8776;
+ src4 = src8;
+ }
+}
+
+static void common_vt_6t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride, /* vertical 6-tap filter producing an 8-wide block */
+ uint8_t *RESTRICT dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height) { /* height >> 2 groups of 4 rows are written; a remainder of height % 4 rows is not */
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10; /* note: there is no src5/src6; the row after src4 is named src7 */
+ v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
+ v16i8 src109_r, filt0, filt1, filt2;
+ v16u8 tmp0, tmp1;
+ v8i16 filt, out0_r, out1_r, out2_r, out3_r;
+
+ src -= (2 * src_stride); /* the filter window starts 2 rows above the output row */
+
+ filt = LD_SH(filter);
+ SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2); /* halfwords 0, 1, 2 = tap pairs (0,1), (2,3), (4,5) */
+
+ LD_SB5(src, src_stride, src0, src1, src2, src3, src4); /* prime the window with the first 5 rows */
+ src += (5 * src_stride);
+
+ XORI_B5_128_SB(src0, src1, src2, src3, src4);
+ ILVR_B4_SB(src1, src0, src3, src2, src2, src1, src4, src3, src10_r, src32_r, /* interleave each row with the row below it */
+ src21_r, src43_r);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) { /* each iteration consumes 4 new rows and emits 4 output rows */
+ LD_SB4(src, src_stride, src7, src8, src9, src10);
+ XORI_B4_128_SB(src7, src8, src9, src10);
+ src += (4 * src_stride);
+
+ ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r, /* src76_r pairs src4 with src7, i.e. consecutive rows */
+ src87_r, src98_r, src109_r);
+ out0_r = DPADD_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
+ out1_r = DPADD_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
+ out2_r = DPADD_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
+ out3_r = DPADD_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
+ SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, VP8_FILTER_SHIFT);
+ SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+ tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
+ tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
+ ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ src10_r = src76_r; /* slide the window down by 4 rows for the next iteration */
+ src32_r = src98_r;
+ src21_r = src87_r;
+ src43_r = src109_r;
+ src4 = src10;
+ }
+}
+
+static void common_vt_6t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride, /* vertical 6-tap filter producing a 16-wide block */
+ uint8_t *RESTRICT dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height) { /* height >> 2 groups of 4 rows are written; a remainder of height % 4 rows is not */
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+ v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+ v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
+ v16i8 src65_l, src87_l, filt0, filt1, filt2;
+ v16u8 tmp0, tmp1, tmp2, tmp3;
+ v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l, filt;
+
+ src -= (2 * src_stride); /* the filter window starts 2 rows above the output row */
+
+ filt = LD_SH(filter);
+ SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2); /* halfwords 0, 1, 2 = tap pairs (0,1), (2,3), (4,5) */
+
+ LD_SB5(src, src_stride, src0, src1, src2, src3, src4); /* prime the window with the first 5 rows */
+ src += (5 * src_stride);
+
+ XORI_B5_128_SB(src0, src1, src2, src3, src4);
+ ILVR_B4_SB(src1, src0, src3, src2, src4, src3, src2, src1, src10_r, src32_r, /* right halves of the row pairs */
+ src43_r, src21_r);
+ ILVL_B4_SB(src1, src0, src3, src2, src4, src3, src2, src1, src10_l, src32_l, /* left halves of the same row pairs */
+ src43_l, src21_l);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) { /* each iteration consumes 4 new rows and emits 4 output rows */
+ LD_SB4(src, src_stride, src5, src6, src7, src8);
+ src += (4 * src_stride);
+
+ XORI_B4_128_SB(src5, src6, src7, src8);
+ ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
+ src76_r, src87_r);
+ ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l,
+ src76_l, src87_l);
+ out0_r = DPADD_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2); /* _r sums: right half of each output row */
+ out1_r = DPADD_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
+ out2_r = DPADD_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
+ out3_r = DPADD_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
+ out0_l = DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2); /* _l sums: left half of each output row */
+ out1_l = DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
+ out2_l = DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
+ out3_l = DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
+ SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, VP8_FILTER_SHIFT);
+ SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, VP8_FILTER_SHIFT);
+ SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+ SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+ PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, out3_r, /* join the left and right halves into full 16-byte rows */
+ tmp0, tmp1, tmp2, tmp3);
+ XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3); /* undo the xor with 128 applied to the inputs */
+ ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ src10_r = src54_r; /* slide the window down by 4 rows for the next iteration */
+ src32_r = src76_r;
+ src21_r = src65_r;
+ src43_r = src87_r;
+ src10_l = src54_l;
+ src32_l = src76_l;
+ src21_l = src65_l;
+ src43_l = src87_l;
+ src4 = src8;
+ }
+}
+
+static void common_hv_6ht_6vt_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+ uint8_t *RESTRICT dst, int32_t dst_stride,
+ const int8_t *filter_horiz,
+ const int8_t *filter_vert,
+ int32_t height) { /* 2-D filter, 4 pixels wide: 6-tap horizontal then 6-tap vertical (going by the name).
+ * src/src_stride: source pixels and row pitch; reading starts two rows above
+ * and two pixels left of src.
+ * dst/dst_stride: destination and row pitch; four rows are stored per loop pass.
+ * filter_horiz / filter_vert: three 16-bit lanes are splatted from each.
+ * height: the loop runs height >> 2 times.
+ * Every HORIZ_6TAP_FILT call here receives two different rows; presumably
+ * the mask loaded from vp8_mc_filt_mask_arr[16] lets one call filter two
+ * 4-pixel rows at once - confirm against the macro.
+ */
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+ v16i8 filt_hz0, filt_hz1, filt_hz2;
+ v16u8 mask0, mask1, mask2, out;
+ v8i16 tmp0, tmp1;
+ v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+ v8i16 hz_out7, filt, filt_vt0, filt_vt1, filt_vt2, out0, out1, out2, out3;
+
+ mask0 = LD_UB(&vp8_mc_filt_mask_arr[16]);
+ src -= (2 + 2 * src_stride);
+
+ filt = LD_SH(filter_horiz);
+ SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);
+ filt = LD_SH(filter_vert);
+ SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+
+ LD_SB5(src, src_stride, src0, src1, src2, src3, src4); /* five history rows before the loop */
+ src += (5 * src_stride);
+
+ XORI_B5_128_SB(src0, src1, src2, src3, src4);
+ hz_out0 = HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_hz0, filt_hz1,
+ filt_hz2);
+ hz_out2 = HORIZ_6TAP_FILT(src2, src3, mask0, mask1, mask2, filt_hz0, filt_hz1,
+ filt_hz2);
+ hz_out1 = (v8i16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8); /* presumably splices the in-between row pair from its two neighbours - confirm */
+ hz_out3 = HORIZ_6TAP_FILT(src3, src4, mask0, mask1, mask2, filt_hz0, filt_hz1,
+ filt_hz2);
+ ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB2(src, src_stride, src5, src6); /* the four new rows are fetched two at a time */
+ src += (2 * src_stride);
+
+ XORI_B2_128_SB(src5, src6);
+ hz_out5 = HORIZ_6TAP_FILT(src5, src6, mask0, mask1, mask2, filt_hz0,
+ filt_hz1, filt_hz2);
+ hz_out4 = (v8i16)__msa_sldi_b((v16i8)hz_out5, (v16i8)hz_out3, 8);
+
+ LD_SB2(src, src_stride, src7, src8);
+ src += (2 * src_stride);
+
+ XORI_B2_128_SB(src7, src8);
+ hz_out7 = HORIZ_6TAP_FILT(src7, src8, mask0, mask1, mask2, filt_hz0,
+ filt_hz1, filt_hz2);
+ hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
+
+ out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
+ tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
+
+ out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
+ tmp1 = DPADD_SH3_SH(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2);
+
+ SRARI_H2_SH(tmp0, tmp1, 7); /* NOTE(review): literal 7; the one-directional kernels in this file pass VP8_FILTER_SHIFT here - presumably the same value, confirm */
+ SAT_SH2_SH(tmp0, tmp1, 7);
+ out = PCKEV_XORI128_UB(tmp0, tmp1);
+ ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ hz_out3 = hz_out7; /* carry the newest horizontal output and interleaved pairs into the next pass */
+ out0 = out2;
+ out1 = out3;
+ }
+}
+
+static void common_hv_6ht_6vt_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+ uint8_t *RESTRICT dst, int32_t dst_stride,
+ const int8_t *filter_horiz,
+ const int8_t *filter_vert,
+ int32_t height) { /* 2-D filter, 8 pixels wide: 6-tap horizontal then 6-tap vertical (going by the name).
+ * src/src_stride: source pixels and row pitch; reading starts two rows above
+ * and two pixels left of src.
+ * dst/dst_stride: destination and row pitch; four rows are stored per loop pass.
+ * filter_horiz / filter_vert: three 16-bit lanes are splatted from each.
+ * height: the loop runs height >> 2 times.
+ * Each row is passed to HORIZ_6TAP_FILT on its own (same vector for both
+ * source operands), one call per row.
+ */
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+ v16i8 filt_hz0, filt_hz1, filt_hz2;
+ v16u8 mask0, mask1, mask2, vec0, vec1;
+ v8i16 filt, filt_vt0, filt_vt1, filt_vt2;
+ v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+ v8i16 hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7;
+ v8i16 tmp0, tmp1, tmp2, tmp3;
+
+ mask0 = LD_UB(&vp8_mc_filt_mask_arr[0]);
+ src -= (2 + 2 * src_stride);
+
+ filt = LD_SH(filter_horiz);
+ SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+
+ LD_SB5(src, src_stride, src0, src1, src2, src3, src4); /* five history rows before the loop */
+ src += (5 * src_stride);
+
+ XORI_B5_128_SB(src0, src1, src2, src3, src4);
+ hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0, filt_hz1,
+ filt_hz2);
+ hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0, filt_hz1,
+ filt_hz2);
+ hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0, filt_hz1,
+ filt_hz2);
+ hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0, filt_hz1,
+ filt_hz2);
+ hz_out4 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0, filt_hz1,
+ filt_hz2);
+
+ filt = LD_SH(filter_vert);
+ SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);
+
+ ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1); /* pairs built from rows 0..3 */
+ ILVEV_B2_SH(hz_out1, hz_out2, hz_out3, hz_out4, out3, out4); /* pairs built from rows 1..4 */
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src5, src6, src7, src8); /* four new rows per pass */
+ src += (4 * src_stride);
+
+ XORI_B4_128_SB(src5, src6, src7, src8);
+ hz_out5 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0,
+ filt_hz1, filt_hz2);
+ out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
+ tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
+
+ hz_out6 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0,
+ filt_hz1, filt_hz2);
+ out5 = (v8i16)__msa_ilvev_b((v16i8)hz_out6, (v16i8)hz_out5);
+ tmp1 = DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);
+
+ hz_out7 = HORIZ_6TAP_FILT(src7, src7, mask0, mask1, mask2, filt_hz0,
+ filt_hz1, filt_hz2);
+ out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
+ tmp2 = DPADD_SH3_SH(out1, out2, out7, filt_vt0, filt_vt1, filt_vt2);
+
+ hz_out8 = HORIZ_6TAP_FILT(src8, src8, mask0, mask1, mask2, filt_hz0,
+ filt_hz1, filt_hz2);
+ out6 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
+ tmp3 = DPADD_SH3_SH(out4, out5, out6, filt_vt0, filt_vt1, filt_vt2);
+
+ SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7); /* NOTE(review): literal 7; the one-directional kernels in this file pass VP8_FILTER_SHIFT here - presumably the same value, confirm */
+ SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+ vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
+ vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
+ ST8x4_UB(vec0, vec1, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ hz_out4 = hz_out8; /* carry the newest horizontal output and the four newest pairs into the next pass */
+ out0 = out2;
+ out1 = out7;
+ out3 = out5;
+ out4 = out6;
+ }
+}
+
+/* 16-pixel-wide 6-tap horizontal / 6-tap vertical filter.
+ * Runs the 8-wide kernel twice: once on the left 8 columns and once on
+ * the right 8 columns of src and dst, with the same filters and height. */
+static void common_hv_6ht_6vt_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                      uint8_t *RESTRICT dst, int32_t dst_stride,
+                                      const int8_t *filter_horiz,
+                                      const int8_t *filter_vert,
+                                      int32_t height) {
+  int32_t col;
+
+  for (col = 0; col < 16; col += 8) {
+    common_hv_6ht_6vt_8w_msa(src + col, src_stride, dst + col, dst_stride,
+                             filter_horiz, filter_vert, height);
+  }
+}
+
+static void common_hz_4t_4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
+ uint8_t *RESTRICT dst, int32_t dst_stride,
+ const int8_t *filter) { /* Horizontal 4-tap filter for a 4x4 block (going by the name).
+ * src/src_stride: source pixels and row pitch; reading starts one pixel left of src.
+ * dst/dst_stride: destination and row pitch; written through ST4x4_UB.
+ * filter: two 16-bit lanes are splatted from it into filt0/filt1.
+ * There is no height argument: exactly four rows are loaded and filtered.
+ */
+ v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
+ v8i16 filt, out0, out1;
+ v16u8 out;
+
+ mask0 = LD_SB(&vp8_mc_filt_mask_arr[16]);
+ src -= 1; /* one pixel of left context */
+
+ filt = LD_SH(filter);
+ SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+ mask1 = mask0 + 2;
+
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ XORI_B4_128_SB(src0, src1, src2, src3); /* presumably biases pixels into signed range; PCKEV_XORI128_UB would undo it - confirm */
+ HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, filt1,
+ out0, out1);
+ SRARI_H2_SH(out0, out1, VP8_FILTER_SHIFT);
+ SAT_SH2_SH(out0, out1, 7);
+ out = PCKEV_XORI128_UB(out0, out1);
+ ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_hz_4t_4x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
+ uint8_t *RESTRICT dst, int32_t dst_stride,
+ const int8_t *filter) { /* Horizontal 4-tap filter for a 4-wide, 8-row block (going by the name).
+ * src/src_stride: source pixels and row pitch; reading starts one pixel left of src.
+ * dst/dst_stride: destination and row pitch; written as two ST4x4_UB stores.
+ * filter: two 16-bit lanes are splatted from it into filt0/filt1.
+ * There is no height argument: exactly eight rows are loaded, four at a time.
+ */
+ v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
+ v16u8 out;
+ v8i16 filt, out0, out1, out2, out3;
+
+ mask0 = LD_SB(&vp8_mc_filt_mask_arr[16]);
+ src -= 1; /* one pixel of left context */
+
+ filt = LD_SH(filter);
+ SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+ mask1 = mask0 + 2;
+
+ LD_SB4(src, src_stride, src0, src1, src2, src3); /* rows 0..3 */
+ src += (4 * src_stride);
+
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, filt1,
+ out0, out1);
+ LD_SB4(src, src_stride, src0, src1, src2, src3); /* rows 4..7 reuse the same row registers */
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, filt1,
+ out2, out3);
+ SRARI_H4_SH(out0, out1, out2, out3, VP8_FILTER_SHIFT);
+ SAT_SH4_SH(out0, out1, out2, out3, 7);
+ out = PCKEV_XORI128_UB(out0, out1);
+ ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ out = PCKEV_XORI128_UB(out2, out3);
+ ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+/* 4-pixel-wide horizontal 4-tap filter: forwards to the fixed-size kernel
+ * matching the block height.  Heights other than 4 and 8 are ignored and
+ * nothing is written to dst. */
+static void common_hz_4t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                uint8_t *RESTRICT dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height) {
+  switch (height) {
+    case 4:
+      common_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+      break;
+
+    case 8:
+      common_hz_4t_4x8_msa(src, src_stride, dst, dst_stride, filter);
+      break;
+
+    default:
+      break;
+  }
+}
+
+static void common_hz_4t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+ uint8_t *RESTRICT dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height) { /* Horizontal 4-tap filter, 8 pixels wide (going by the name).
+ * src/src_stride: source pixels and row pitch; reading starts one pixel left of src.
+ * dst/dst_stride: destination and row pitch; four rows are stored per loop pass.
+ * filter: two 16-bit lanes are splatted from it into filt0/filt1.
+ * height: the loop runs height >> 2 times, so a remainder of height modulo 4
+ * is not processed.
+ */
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
+ v16u8 tmp0, tmp1;
+ v8i16 filt, out0, out1, out2, out3;
+
+ mask0 = LD_SB(&vp8_mc_filt_mask_arr[0]);
+ src -= 1; /* one pixel of left context */
+
+ filt = LD_SH(filter);
+ SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+ mask1 = mask0 + 2;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src0, src1, src2, src3); /* four rows per pass; no state is carried between passes */
+ src += (4 * src_stride);
+
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
+ filt1, out0, out1, out2, out3);
+ SRARI_H4_SH(out0, out1, out2, out3, VP8_FILTER_SHIFT);
+ SAT_SH4_SH(out0, out1, out2, out3, 7);
+ tmp0 = PCKEV_XORI128_UB(out0, out1);
+ tmp1 = PCKEV_XORI128_UB(out2, out3);
+ ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+}
+
+static void common_hz_4t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+ uint8_t *RESTRICT dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height) { /* Horizontal 4-tap filter, 16 pixels wide (going by the name).
+ * src/src_stride: source pixels and row pitch; reading starts one pixel left of src.
+ * dst/dst_stride: destination and row pitch; one ST_UB store per output row.
+ * filter: two 16-bit lanes are splatted from it into filt0/filt1.
+ * height: the loop runs height >> 2 times, four rows per pass.
+ * Each source row is loaded as two vectors, one at src and one at src + 8.
+ */
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16i8 filt0, filt1, mask0, mask1;
+ v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
+ v16u8 out;
+
+ mask0 = LD_SB(&vp8_mc_filt_mask_arr[0]);
+ src -= 1; /* one pixel of left context */
+
+ filt = LD_SH(filter);
+ SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+ mask1 = mask0 + 2;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src0, src2, src4, src6); /* even-numbered registers: vectors starting at src */
+ LD_SB4(src + 8, src_stride, src1, src3, src5, src7); /* odd-numbered registers: vectors starting 8 bytes further right */
+ src += (4 * src_stride);
+
+ XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+ HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
+ filt1, out0, out1, out2, out3);
+ HORIZ_4TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, filt0,
+ filt1, out4, out5, out6, out7);
+ SRARI_H4_SH(out0, out1, out2, out3, VP8_FILTER_SHIFT);
+ SRARI_H4_SH(out4, out5, out6, out7, VP8_FILTER_SHIFT);
+ SAT_SH4_SH(out0, out1, out2, out3, 7);
+ SAT_SH4_SH(out4, out5, out6, out7, 7);
+ out = PCKEV_XORI128_UB(out0, out1); /* each pair of results forms one 16-byte output row */
+ ST_UB(out, dst);
+ dst += dst_stride;
+ out = PCKEV_XORI128_UB(out2, out3);
+ ST_UB(out, dst);
+ dst += dst_stride;
+ out = PCKEV_XORI128_UB(out4, out5);
+ ST_UB(out, dst);
+ dst += dst_stride;
+ out = PCKEV_XORI128_UB(out6, out7);
+ ST_UB(out, dst);
+ dst += dst_stride;
+ }
+}
+
+static void common_vt_4t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+ uint8_t *RESTRICT dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height) { /* Vertical 4-tap filter, 4 pixels wide (going by the name).
+ * src/src_stride: source pixels and row pitch; reading starts one row above src.
+ * dst/dst_stride: destination and row pitch; four rows are stored per loop pass.
+ * filter: two 16-bit lanes are splatted from it into filt0/filt1.
+ * height: the loop runs height >> 2 times, so a remainder of height modulo 4
+ * is not processed.
+ * Two interleaved row pairs are packed into one vector (src2110, src4332)
+ * with __msa_ilvr_d before filtering.
+ */
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5;
+ v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
+ v16i8 src2110, src4332, filt0, filt1;
+ v8i16 filt, out10, out32;
+ v16u8 out;
+
+ src -= src_stride; /* one row of context above the block */
+
+ filt = LD_SH(filter);
+ SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+ LD_SB3(src, src_stride, src0, src1, src2); /* three history rows before the loop */
+ src += (3 * src_stride);
+
+ ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+
+ src2110 = (v16i8)__msa_ilvr_d((v2i64)src21_r, (v2i64)src10_r);
+ src2110 = (v16i8)__msa_xori_b((v16u8)src2110, 128); /* the xor with 128 is applied after packing here, not on the raw rows */
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB3(src, src_stride, src3, src4, src5); /* three of the four new rows */
+ src += (3 * src_stride);
+ ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+ src4332 = (v16i8)__msa_ilvr_d((v2i64)src43_r, (v2i64)src32_r);
+ src4332 = (v16i8)__msa_xori_b((v16u8)src4332, 128);
+ out10 = FILT_4TAP_DPADD_S_H(src2110, src4332, filt0, filt1);
+
+ src2 = LD_SB(src); /* fourth new row; reuses src2 so it is also the carried row for the next pass */
+ src += (src_stride);
+ ILVR_B2_SB(src5, src4, src2, src5, src54_r, src65_r);
+ src2110 = (v16i8)__msa_ilvr_d((v2i64)src65_r, (v2i64)src54_r); /* reuses src2110, which the next pass reads as its history */
+ src2110 = (v16i8)__msa_xori_b((v16u8)src2110, 128);
+ out32 = FILT_4TAP_DPADD_S_H(src4332, src2110, filt0, filt1);
+ SRARI_H2_SH(out10, out32, VP8_FILTER_SHIFT);
+ SAT_SH2_SH(out10, out32, 7);
+ out = PCKEV_XORI128_UB(out10, out32);
+ ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+}
+
+static void common_vt_4t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+ uint8_t *RESTRICT dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height) { /* Vertical 4-tap filter, 8 pixels wide (going by the name).
+ * src/src_stride: source pixels and row pitch; reading starts one row above src.
+ * dst/dst_stride: destination and row pitch; four rows are stored per loop pass.
+ * filter: two 16-bit lanes are splatted from it into filt0/filt1.
+ * height: the loop runs height >> 2 times, so a remainder of height modulo 4
+ * is not processed.
+ * Naming note: src10 is a loaded row (the fourth row of each pass), while
+ * src10_r is the interleave of rows src1 and src0.
+ */
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src7, src8, src9, src10;
+ v16i8 src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1;
+ v16u8 tmp0, tmp1;
+ v8i16 filt, out0_r, out1_r, out2_r, out3_r;
+
+ src -= src_stride; /* one row of context above the block */
+
+ filt = LD_SH(filter);
+ SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+ LD_SB3(src, src_stride, src0, src1, src2); /* three history rows before the loop */
+ src += (3 * src_stride);
+
+ XORI_B3_128_SB(src0, src1, src2);
+ ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src7, src8, src9, src10); /* four new rows per pass */
+ src += (4 * src_stride);
+
+ XORI_B4_128_SB(src7, src8, src9, src10);
+ ILVR_B4_SB(src7, src2, src8, src7, src9, src8, src10, src9, src72_r,
+ src87_r, src98_r, src109_r);
+ out0_r = FILT_4TAP_DPADD_S_H(src10_r, src72_r, filt0, filt1);
+ out1_r = FILT_4TAP_DPADD_S_H(src21_r, src87_r, filt0, filt1);
+ out2_r = FILT_4TAP_DPADD_S_H(src72_r, src98_r, filt0, filt1);
+ out3_r = FILT_4TAP_DPADD_S_H(src87_r, src109_r, filt0, filt1);
+ SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, VP8_FILTER_SHIFT);
+ SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+ tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
+ tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
+ ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ src10_r = src98_r; /* carry the two newest row pairs and the last row into the next pass */
+ src21_r = src109_r;
+ src2 = src10;
+ }
+}
+
+static void common_vt_4t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+ uint8_t *RESTRICT dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height) { /* Vertical 4-tap filter, 16 pixels wide (going by the name).
+ * src/src_stride: source pixels and row pitch; reading starts one row above src.
+ * dst/dst_stride: destination and row pitch; four rows are stored per loop pass.
+ * filter: two 16-bit lanes are splatted from it into filt0/filt1.
+ * height: the loop runs height >> 2 times, so a remainder of height modulo 4
+ * is not processed.
+ * The _r / _l suffixes follow the ILVR / ILVL macros that produce them
+ * (presumably the right and left halves of the 16-byte rows - confirm).
+ */
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6;
+ v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l;
+ v16i8 src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1;
+ v16u8 tmp0, tmp1, tmp2, tmp3;
+ v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+
+ src -= src_stride; /* one row of context above the block */
+
+ filt = LD_SH(filter);
+ SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+ LD_SB3(src, src_stride, src0, src1, src2); /* three history rows before the loop */
+ src += (3 * src_stride);
+
+ XORI_B3_128_SB(src0, src1, src2);
+ ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+ ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src3, src4, src5, src6); /* four new rows per pass */
+ src += (4 * src_stride);
+
+ XORI_B4_128_SB(src3, src4, src5, src6);
+ ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5, src32_r, src43_r,
+ src54_r, src65_r);
+ ILVL_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5, src32_l, src43_l,
+ src54_l, src65_l);
+ out0_r = FILT_4TAP_DPADD_S_H(src10_r, src32_r, filt0, filt1);
+ out1_r = FILT_4TAP_DPADD_S_H(src21_r, src43_r, filt0, filt1);
+ out2_r = FILT_4TAP_DPADD_S_H(src32_r, src54_r, filt0, filt1);
+ out3_r = FILT_4TAP_DPADD_S_H(src43_r, src65_r, filt0, filt1);
+ out0_l = FILT_4TAP_DPADD_S_H(src10_l, src32_l, filt0, filt1);
+ out1_l = FILT_4TAP_DPADD_S_H(src21_l, src43_l, filt0, filt1);
+ out2_l = FILT_4TAP_DPADD_S_H(src32_l, src54_l, filt0, filt1);
+ out3_l = FILT_4TAP_DPADD_S_H(src43_l, src65_l, filt0, filt1);
+ SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, VP8_FILTER_SHIFT);
+ SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, VP8_FILTER_SHIFT);
+ SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+ SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+ PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, out3_r,
+ tmp0, tmp1, tmp2, tmp3);
+ XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
+ ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ src10_r = src54_r; /* carry the two newest row pairs (both halves) and the last row into the next pass */
+ src21_r = src65_r;
+ src10_l = src54_l;
+ src21_l = src65_l;
+ src2 = src6;
+ }
+}
+
+static void common_hv_4ht_4vt_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+ uint8_t *RESTRICT dst, int32_t dst_stride,
+ const int8_t *filter_horiz,
+ const int8_t *filter_vert,
+ int32_t height) { /* 2-D filter, 4 pixels wide: 4-tap horizontal then 4-tap vertical (going by the name).
+ * src/src_stride: source pixels and row pitch; reading starts one row above
+ * and one pixel left of src.
+ * dst/dst_stride: destination and row pitch; four rows are stored per loop pass.
+ * filter_horiz / filter_vert: two 16-bit lanes are splatted from each.
+ * height: the loop runs height >> 2 times.
+ * Every HORIZ_4TAP_FILT call here receives two different rows; presumably
+ * the mask loaded from vp8_mc_filt_mask_arr[16] lets one call filter two
+ * 4-pixel rows at once - confirm against the macro.
+ */
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1;
+ v16u8 mask0, mask1, out;
+ v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2;
+ v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5;
+
+ mask0 = LD_UB(&vp8_mc_filt_mask_arr[16]);
+ src -= (1 + 1 * src_stride);
+
+ filt = LD_SH(filter_horiz);
+ SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);
+
+ mask1 = mask0 + 2;
+
+ LD_SB3(src, src_stride, src0, src1, src2); /* three history rows before the loop */
+ src += (3 * src_stride);
+
+ XORI_B3_128_SB(src0, src1, src2);
+ hz_out0 = HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_hz0, filt_hz1);
+ hz_out1 = HORIZ_4TAP_FILT(src1, src2, mask0, mask1, filt_hz0, filt_hz1);
+ vec0 = (v8i16)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+
+ filt = LD_SH(filter_vert);
+ SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src3, src4, src5, src6); /* four new rows per pass */
+ src += (4 * src_stride);
+
+ XORI_B2_128_SB(src3, src4); /* the xor with 128 is applied two rows at a time, just before each pair is filtered */
+ hz_out3 = HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1);
+ hz_out2 = (v8i16)__msa_sldi_b((v16i8)hz_out3, (v16i8)hz_out1, 8); /* presumably splices the in-between row pair from its two neighbours - confirm */
+ vec1 = (v8i16)__msa_ilvev_b((v16i8)hz_out3, (v16i8)hz_out2);
+ tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);
+
+ XORI_B2_128_SB(src5, src6);
+ hz_out5 = HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1);
+ hz_out4 = (v8i16)__msa_sldi_b((v16i8)hz_out5, (v16i8)hz_out3, 8);
+ vec2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
+ tmp1 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1);
+
+ SRARI_H2_SH(tmp0, tmp1, 7); /* NOTE(review): literal 7; the one-directional kernels in this file pass VP8_FILTER_SHIFT here - presumably the same value, confirm */
+ SAT_SH2_SH(tmp0, tmp1, 7);
+ out = PCKEV_XORI128_UB(tmp0, tmp1);
+ ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ hz_out1 = hz_out5; /* carry the newest horizontal output and interleaved pair into the next pass */
+ vec0 = vec2;
+ }
+}
+
+static void common_hv_4ht_4vt_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+ uint8_t *RESTRICT dst, int32_t dst_stride,
+ const int8_t *filter_horiz,
+ const int8_t *filter_vert,
+ int32_t height) { /* 2-D filter, 8 pixels wide: 4-tap horizontal then 4-tap vertical (going by the name).
+ * src/src_stride: source pixels and row pitch; reading starts one row above
+ * and one pixel left of src.
+ * dst/dst_stride: destination and row pitch; four rows are stored per loop pass.
+ * filter_horiz / filter_vert: two 16-bit lanes are splatted from each.
+ * height: the loop runs height >> 2 times.
+ * Each row is passed to HORIZ_4TAP_FILT on its own (same vector for both
+ * source operands). hz_out0..hz_out2 are reused inside the loop for the
+ * newly loaded rows.
+ */
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1;
+ v16u8 mask0, mask1, out0, out1;
+ v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, tmp2, tmp3;
+ v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
+ v8i16 vec0, vec1, vec2, vec3, vec4;
+
+ mask0 = LD_UB(&vp8_mc_filt_mask_arr[0]);
+ src -= (1 + 1 * src_stride);
+
+ filt = LD_SH(filter_horiz);
+ SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);
+
+ mask1 = mask0 + 2;
+
+ LD_SB3(src, src_stride, src0, src1, src2); /* three history rows before the loop */
+ src += (3 * src_stride);
+
+ XORI_B3_128_SB(src0, src1, src2);
+ hz_out0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1);
+ hz_out1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1);
+ hz_out2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1);
+ ILVEV_B2_SH(hz_out0, hz_out1, hz_out1, hz_out2, vec0, vec2);
+
+ filt = LD_SH(filter_vert);
+ SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src3, src4, src5, src6); /* four new rows per pass */
+ src += (4 * src_stride);
+
+ XORI_B4_128_SB(src3, src4, src5, src6);
+ hz_out3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1);
+ vec1 = (v8i16)__msa_ilvev_b((v16i8)hz_out3, (v16i8)hz_out2);
+ tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);
+
+ hz_out0 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1);
+ vec3 = (v8i16)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out3);
+ tmp1 = FILT_4TAP_DPADD_S_H(vec2, vec3, filt_vt0, filt_vt1);
+
+ hz_out1 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1);
+ vec4 = (v8i16)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+ tmp2 = FILT_4TAP_DPADD_S_H(vec1, vec4, filt_vt0, filt_vt1);
+
+ hz_out2 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1);
+ ILVEV_B2_SH(hz_out3, hz_out0, hz_out1, hz_out2, vec0, vec1); /* overwrites vec0/vec1; vec1 then holds the pair carried as vec2 below */
+ tmp3 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);
+
+ SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7); /* NOTE(review): literal 7; the one-directional kernels in this file pass VP8_FILTER_SHIFT here - presumably the same value, confirm */
+ SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+ out0 = PCKEV_XORI128_UB(tmp0, tmp1);
+ out1 = PCKEV_XORI128_UB(tmp2, tmp3);
+ ST8x4_UB(out0, out1, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ vec0 = vec4; /* carry the two newest interleaved pairs; hz_out2 is carried implicitly */
+ vec2 = vec1;
+ }
+}
+
+/* 16-pixel-wide 4-tap horizontal / 4-tap vertical filter.
+ * Runs the 8-wide kernel twice: once on the left 8 columns and once on
+ * the right 8 columns of src and dst, with the same filters and height. */
+static void common_hv_4ht_4vt_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                      uint8_t *RESTRICT dst, int32_t dst_stride,
+                                      const int8_t *filter_horiz,
+                                      const int8_t *filter_vert,
+                                      int32_t height) {
+  int32_t col;
+
+  for (col = 0; col < 16; col += 8) {
+    common_hv_4ht_4vt_8w_msa(src + col, src_stride, dst + col, dst_stride,
+                             filter_horiz, filter_vert, height);
+  }
+}
+
+static void common_hv_6ht_4vt_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+ uint8_t *RESTRICT dst, int32_t dst_stride,
+ const int8_t *filter_horiz,
+ const int8_t *filter_vert,
+ int32_t height) { /* 2-D filter, 4 pixels wide: 6-tap horizontal then 4-tap vertical (going by the name).
+ * src/src_stride: source pixels and row pitch; reading starts one row above
+ * and two pixels left of src.
+ * dst/dst_stride: destination and row pitch; four rows are stored per loop pass.
+ * filter_horiz: three 16-bit lanes are splatted from it.
+ * filter_vert: two 16-bit lanes are splatted from it.
+ * height: the loop runs height >> 2 times.
+ * Every HORIZ_6TAP_FILT call here receives two different rows; presumably
+ * the mask loaded from vp8_mc_filt_mask_arr[16] lets one call filter two
+ * 4-pixel rows at once - confirm against the macro.
+ */
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6;
+ v16i8 filt_hz0, filt_hz1, filt_hz2;
+ v16u8 res0, res1, mask0, mask1, mask2;
+ v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2;
+ v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5;
+
+ mask0 = LD_UB(&vp8_mc_filt_mask_arr[16]);
+ src -= (2 + 1 * src_stride);
+
+ filt = LD_SH(filter_horiz);
+ SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+
+ LD_SB3(src, src_stride, src0, src1, src2); /* three history rows before the loop */
+ src += (3 * src_stride);
+
+ XORI_B3_128_SB(src0, src1, src2);
+ hz_out0 = HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_hz0, filt_hz1,
+ filt_hz2);
+ hz_out1 = HORIZ_6TAP_FILT(src1, src2, mask0, mask1, mask2, filt_hz0, filt_hz1,
+ filt_hz2);
+ vec0 = (v8i16)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+
+ filt = LD_SH(filter_vert);
+ SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src3, src4, src5, src6); /* four new rows per pass */
+ src += (4 * src_stride);
+
+ XORI_B4_128_SB(src3, src4, src5, src6);
+ hz_out3 = HORIZ_6TAP_FILT(src3, src4, mask0, mask1, mask2, filt_hz0,
+ filt_hz1, filt_hz2);
+ hz_out2 = (v8i16)__msa_sldi_b((v16i8)hz_out3, (v16i8)hz_out1, 8); /* presumably splices the in-between row pair from its two neighbours - confirm */
+ vec1 = (v8i16)__msa_ilvev_b((v16i8)hz_out3, (v16i8)hz_out2);
+ tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);
+
+ hz_out5 = HORIZ_6TAP_FILT(src5, src6, mask0, mask1, mask2, filt_hz0,
+ filt_hz1, filt_hz2);
+ hz_out4 = (v8i16)__msa_sldi_b((v16i8)hz_out5, (v16i8)hz_out3, 8);
+ vec2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
+ tmp1 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1);
+
+ SRARI_H2_SH(tmp0, tmp1, 7); /* NOTE(review): literal 7; the one-directional kernels in this file pass VP8_FILTER_SHIFT here - presumably the same value, confirm */
+ SAT_SH2_SH(tmp0, tmp1, 7);
+ PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1); /* NOTE(review): packs each result separately and stores words 0,1 of each, where the other 4-wide hv kernels use PCKEV_XORI128_UB and words 0..3 of one vector - presumably the same output, confirm */
+ XORI_B2_128_UB(res0, res1);
+ ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ hz_out1 = hz_out5; /* carry the newest horizontal output and interleaved pair into the next pass */
+ vec0 = vec2;
+ }
+}
+
+static void common_hv_6ht_4vt_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+ uint8_t *RESTRICT dst, int32_t dst_stride,
+ const int8_t *filter_horiz,
+ const int8_t *filter_vert,
+ int32_t height) { /* 2-D filter, 8 pixels wide: 6-tap horizontal then 4-tap vertical (going by the name).
+ * src/src_stride: source pixels and row pitch; reading starts one row above
+ * and two pixels left of src.
+ * dst/dst_stride: destination and row pitch; four rows are stored per loop pass.
+ * filter_horiz: three 16-bit lanes are splatted from it.
+ * filter_vert: two 16-bit lanes are splatted from it.
+ * height: the loop runs height >> 2 times.
+ * Each row is passed to HORIZ_6TAP_FILT on its own (same vector for both
+ * source operands). There is no explicit carry step at the end of the
+ * loop: vec0, vec2 and hz_out2 are overwritten inside the pass with the
+ * values the next pass reads.
+ */
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6;
+ v16i8 filt_hz0, filt_hz1, filt_hz2, mask0, mask1, mask2;
+ v8i16 filt, filt_vt0, filt_vt1, hz_out0, hz_out1, hz_out2, hz_out3;
+ v8i16 tmp0, tmp1, tmp2, tmp3, vec0, vec1, vec2, vec3;
+ v16u8 out0, out1;
+
+ mask0 = LD_SB(&vp8_mc_filt_mask_arr[0]);
+ src -= (2 + src_stride);
+
+ filt = LD_SH(filter_horiz);
+ SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+
+ LD_SB3(src, src_stride, src0, src1, src2); /* three history rows before the loop */
+ src += (3 * src_stride);
+
+ XORI_B3_128_SB(src0, src1, src2);
+ hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0, filt_hz1,
+ filt_hz2);
+ hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0, filt_hz1,
+ filt_hz2);
+ hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0, filt_hz1,
+ filt_hz2);
+ ILVEV_B2_SH(hz_out0, hz_out1, hz_out1, hz_out2, vec0, vec2);
+
+ filt = LD_SH(filter_vert);
+ SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src3, src4, src5, src6); /* four new rows per pass */
+ src += (4 * src_stride);
+
+ XORI_B4_128_SB(src3, src4, src5, src6);
+
+ hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0,
+ filt_hz1, filt_hz2);
+ vec1 = (v8i16)__msa_ilvev_b((v16i8)hz_out3, (v16i8)hz_out2);
+ tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);
+
+ hz_out0 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0,
+ filt_hz1, filt_hz2);
+ vec3 = (v8i16)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out3);
+ tmp1 = FILT_4TAP_DPADD_S_H(vec2, vec3, filt_vt0, filt_vt1);
+
+ hz_out1 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0,
+ filt_hz1, filt_hz2);
+ vec0 = (v8i16)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); /* also the vec0 read at the top of the next pass */
+ tmp2 = FILT_4TAP_DPADD_S_H(vec1, vec0, filt_vt0, filt_vt1);
+
+ hz_out2 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0,
+ filt_hz1, filt_hz2);
+ ILVEV_B2_SH(hz_out3, hz_out0, hz_out1, hz_out2, vec1, vec2); /* vec2 written here is the one read by the next pass */
+ tmp3 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1);
+
+ SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7); /* NOTE(review): literal 7; the one-directional kernels in this file pass VP8_FILTER_SHIFT here - presumably the same value, confirm */
+ SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+ out0 = PCKEV_XORI128_UB(tmp0, tmp1);
+ out1 = PCKEV_XORI128_UB(tmp2, tmp3);
+ ST8x4_UB(out0, out1, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+}
+
+/* 16-pixel-wide 6-tap horizontal / 4-tap vertical filter.
+ * Runs the 8-wide kernel twice: once on the left 8 columns and once on
+ * the right 8 columns of src and dst, with the same filters and height. */
+static void common_hv_6ht_4vt_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                      uint8_t *RESTRICT dst, int32_t dst_stride,
+                                      const int8_t *filter_horiz,
+                                      const int8_t *filter_vert,
+                                      int32_t height) {
+  int32_t col;
+
+  for (col = 0; col < 16; col += 8) {
+    common_hv_6ht_4vt_8w_msa(src + col, src_stride, dst + col, dst_stride,
+                             filter_horiz, filter_vert, height);
+  }
+}
+
+static void common_hv_4ht_6vt_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+ uint8_t *RESTRICT dst, int32_t dst_stride,
+ const int8_t *filter_horiz,
+ const int8_t *filter_vert,
+ int32_t height) { /* 2-D filter, 4 pixels wide: 4-tap horizontal then 6-tap vertical (going by the name).
+ * src/src_stride: source pixels and row pitch; reading starts two rows above
+ * and one pixel left of src.
+ * dst/dst_stride: destination and row pitch; four rows are stored per loop pass.
+ * filter_horiz: two 16-bit lanes are splatted from it.
+ * filter_vert: three 16-bit lanes are splatted from it.
+ * height: the loop runs height >> 2 times.
+ * Every HORIZ_4TAP_FILT call here receives two different rows; presumably
+ * the mask loaded from vp8_mc_filt_mask_arr[16] lets one call filter two
+ * 4-pixel rows at once - confirm against the macro.
+ */
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+ v16i8 filt_hz0, filt_hz1, mask0, mask1;
+ v16u8 out;
+ v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+ v8i16 hz_out7, tmp0, tmp1, out0, out1, out2, out3;
+ v8i16 filt, filt_vt0, filt_vt1, filt_vt2;
+
+ mask0 = LD_SB(&vp8_mc_filt_mask_arr[16]);
+
+ src -= (1 + 2 * src_stride);
+
+ filt = LD_SH(filter_horiz);
+ SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);
+
+ mask1 = mask0 + 2;
+
+ LD_SB5(src, src_stride, src0, src1, src2, src3, src4); /* five history rows before the loop */
+ src += (5 * src_stride);
+
+ XORI_B5_128_SB(src0, src1, src2, src3, src4);
+ hz_out0 = HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_hz0, filt_hz1);
+ hz_out2 = HORIZ_4TAP_FILT(src2, src3, mask0, mask1, filt_hz0, filt_hz1);
+ hz_out3 = HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1);
+ hz_out1 = (v8i16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8); /* presumably splices the in-between row pair from its two neighbours - confirm */
+ ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
+
+ filt = LD_SH(filter_vert);
+ SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src5, src6, src7, src8); /* four new rows per pass */
+ XORI_B4_128_SB(src5, src6, src7, src8);
+ src += (4 * src_stride);
+
+ hz_out5 = HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1);
+ hz_out4 = (v8i16)__msa_sldi_b((v16i8)hz_out5, (v16i8)hz_out3, 8);
+ out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
+ tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
+
+ hz_out7 = HORIZ_4TAP_FILT(src7, src8, mask0, mask1, filt_hz0, filt_hz1);
+ hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
+ out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
+ tmp1 = DPADD_SH3_SH(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2);
+
+ SRARI_H2_SH(tmp0, tmp1, 7); /* NOTE(review): literal 7; the one-directional kernels in this file pass VP8_FILTER_SHIFT here - presumably the same value, confirm */
+ SAT_SH2_SH(tmp0, tmp1, 7);
+ out = PCKEV_XORI128_UB(tmp0, tmp1);
+ ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ hz_out3 = hz_out7; /* carry the newest horizontal output and interleaved pairs into the next pass */
+ out0 = out2;
+ out1 = out3;
+ }
+}
+
+static void common_hv_4ht_6vt_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+ uint8_t *RESTRICT dst, int32_t dst_stride,
+ const int8_t *filter_horiz,
+ const int8_t *filter_vert,
+ int32_t height) { /* 2-D filter, 8 pixels wide: 4-tap horizontal then 6-tap vertical (going by the name).
+ * src/src_stride: source pixels and row pitch; reading starts two rows above
+ * and one pixel left of src.
+ * dst/dst_stride: destination and row pitch; four rows are stored per loop pass.
+ * filter_horiz: two 16-bit lanes are splatted from it.
+ * filter_vert: three 16-bit lanes are splatted from it.
+ * height: the loop runs height >> 2 times.
+ * Each row is passed to HORIZ_4TAP_FILT on its own (same vector for both
+ * source operands), one call per row.
+ */
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+ v16i8 filt_hz0, filt_hz1, mask0, mask1;
+ v8i16 filt, filt_vt0, filt_vt1, filt_vt2, tmp0, tmp1, tmp2, tmp3;
+ v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+ v8i16 hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7;
+ v16u8 vec0, vec1;
+
+ mask0 = LD_SB(&vp8_mc_filt_mask_arr[0]);
+ src -= (1 + 2 * src_stride);
+
+ filt = LD_SH(filter_horiz);
+ SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);
+
+ mask1 = mask0 + 2;
+
+ LD_SB5(src, src_stride, src0, src1, src2, src3, src4); /* five history rows before the loop */
+ src += (5 * src_stride);
+
+ XORI_B5_128_SB(src0, src1, src2, src3, src4);
+ hz_out0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1);
+ hz_out1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1);
+ hz_out2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1);
+ hz_out3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1);
+ hz_out4 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1);
+ ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1); /* pairs built from rows 0..3 */
+ ILVEV_B2_SH(hz_out1, hz_out2, hz_out3, hz_out4, out3, out4); /* pairs built from rows 1..4 */
+
+ filt = LD_SH(filter_vert);
+ SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src5, src6, src7, src8); /* four new rows per pass */
+ src += (4 * src_stride);
+
+ XORI_B4_128_SB(src5, src6, src7, src8);
+
+ hz_out5 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1);
+ out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
+ tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
+
+ hz_out6 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1);
+ out5 = (v8i16)__msa_ilvev_b((v16i8)hz_out6, (v16i8)hz_out5);
+ tmp1 = DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);
+
+ hz_out7 = HORIZ_4TAP_FILT(src7, src7, mask0, mask1, filt_hz0, filt_hz1);
+ out6 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
+ tmp2 = DPADD_SH3_SH(out1, out2, out6, filt_vt0, filt_vt1, filt_vt2);
+
+ hz_out8 = HORIZ_4TAP_FILT(src8, src8, mask0, mask1, filt_hz0, filt_hz1);
+ out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
+ tmp3 = DPADD_SH3_SH(out4, out5, out7, filt_vt0, filt_vt1, filt_vt2);
+
+ SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7); /* NOTE(review): literal 7; the one-directional kernels in this file pass VP8_FILTER_SHIFT here - presumably the same value, confirm */
+ SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+ vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
+ vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
+ ST8x4_UB(vec0, vec1, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ hz_out4 = hz_out8; /* carry the newest horizontal output and the four newest pairs into the next pass */
+ out0 = out2;
+ out1 = out6;
+ out3 = out5;
+ out4 = out7;
+ }
+}
+
+/* 16-pixel-wide 4-tap horizontal / 6-tap vertical filter.
+ * Runs the 8-wide kernel twice: once on the left 8 columns and once on
+ * the right 8 columns of src and dst, with the same filters and height. */
+static void common_hv_4ht_6vt_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                      uint8_t *RESTRICT dst, int32_t dst_stride,
+                                      const int8_t *filter_horiz,
+                                      const int8_t *filter_vert,
+                                      int32_t height) {
+  int32_t col;
+
+  for (col = 0; col < 16; col += 8) {
+    common_hv_4ht_6vt_8w_msa(src + col, src_stride, dst + col, dst_stride,
+                             filter_horiz, filter_vert, height);
+  }
+}
+
+void vp8_sixtap_predict4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                               int32_t xoffset, int32_t yoffset,
+                               uint8_t *RESTRICT dst, int32_t dst_stride) {
+  /* Offset 0 means that axis is unfiltered; clamp so row -1 is never formed. */
+  const int8_t *h_filter = vp8_subpel_filters_msa[xoffset ? xoffset - 1 : 0];
+  const int8_t *v_filter = vp8_subpel_filters_msa[yoffset ? yoffset - 1 : 0];
+  if (yoffset) {
+    if (xoffset) {
+      switch (xoffset) {
+        case 2:
+        case 4:
+        case 6:
+          switch (yoffset) {
+            case 2:
+            case 4:
+            case 6:
+              common_hv_6ht_6vt_4w_msa(src, src_stride, dst, dst_stride,
+                                       h_filter, v_filter, 4);
+              break;
+            /* Odd yoffset: the "4vt" routine is handed v_filter from index 1. */
+            case 1:
+            case 3:
+            case 5:
+            case 7:
+              common_hv_6ht_4vt_4w_msa(src, src_stride, dst, dst_stride,
+                                       h_filter, v_filter + 1, 4);
+              break;
+          }
+          break;
+        /* Odd xoffset: the "4ht" routines are handed h_filter from index 1. */
+        case 1:
+        case 3:
+        case 5:
+        case 7:
+          switch (yoffset) {
+            case 2:
+            case 4:
+            case 6:
+              common_hv_4ht_6vt_4w_msa(src, src_stride, dst, dst_stride,
+                                       h_filter + 1, v_filter, 4);
+              break;
+
+            case 1:
+            case 3:
+            case 5:
+            case 7:
+              common_hv_4ht_4vt_4w_msa(src, src_stride, dst, dst_stride,
+                                       h_filter + 1, v_filter + 1, 4);
+              break;
+          }
+          break;
+      }
+    } else {
+      switch (yoffset) {
+        case 2:
+        case 4:
+        case 6:
+          common_vt_6t_4w_msa(src, src_stride, dst, dst_stride, v_filter, 4);
+          break;
+
+        case 1:
+        case 3:
+        case 5:
+        case 7:
+          common_vt_4t_4w_msa(src, src_stride, dst, dst_stride, v_filter + 1,
+                              4);
+          break;
+      }
+    }
+  } else {
+    switch (xoffset) {
+      case 0: {
+        uint32_t tp0, tp1, tp2, tp3;
+        /* Neither axis is filtered: copy four 32-bit rows. */
+        LW4(src, src_stride, tp0, tp1, tp2, tp3);
+        SW4(tp0, tp1, tp2, tp3, dst, dst_stride);
+        break;
+      }
+      case 2:
+      case 4:
+      case 6:
+        common_hz_6t_4w_msa(src, src_stride, dst, dst_stride, h_filter, 4);
+        break;
+
+      case 1:
+      case 3:
+      case 5:
+      case 7:
+        common_hz_4t_4w_msa(src, src_stride, dst, dst_stride, h_filter + 1, 4);
+        break;
+    }
+  }
+}
+
+void vp8_sixtap_predict8x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                               int32_t xoffset, int32_t yoffset,
+                               uint8_t *RESTRICT dst, int32_t dst_stride) {
+  /* Offset 0 means that axis is unfiltered; clamp so row -1 is never formed. */
+  const int8_t *h_filter = vp8_subpel_filters_msa[xoffset ? xoffset - 1 : 0];
+  const int8_t *v_filter = vp8_subpel_filters_msa[yoffset ? yoffset - 1 : 0];
+  if (yoffset) {
+    if (xoffset) {
+      switch (xoffset) {
+        case 2:
+        case 4:
+        case 6:
+          switch (yoffset) {
+            case 2:
+            case 4:
+            case 6:
+              common_hv_6ht_6vt_8w_msa(src, src_stride, dst, dst_stride,
+                                       h_filter, v_filter, 4);
+              break;
+            /* Odd yoffset: the "4vt" routine is handed v_filter from index 1. */
+            case 1:
+            case 3:
+            case 5:
+            case 7:
+              common_hv_6ht_4vt_8w_msa(src, src_stride, dst, dst_stride,
+                                       h_filter, v_filter + 1, 4);
+              break;
+          }
+          break;
+        /* Odd xoffset: the "4ht" routines are handed h_filter from index 1. */
+        case 1:
+        case 3:
+        case 5:
+        case 7:
+          switch (yoffset) {
+            case 2:
+            case 4:
+            case 6:
+              common_hv_4ht_6vt_8w_msa(src, src_stride, dst, dst_stride,
+                                       h_filter + 1, v_filter, 4);
+              break;
+
+            case 1:
+            case 3:
+            case 5:
+            case 7:
+              common_hv_4ht_4vt_8w_msa(src, src_stride, dst, dst_stride,
+                                       h_filter + 1, v_filter + 1, 4);
+              break;
+          }
+          break;
+      }
+    } else {
+      switch (yoffset) {
+        case 2:
+        case 4:
+        case 6:
+          common_vt_6t_8w_msa(src, src_stride, dst, dst_stride, v_filter, 4);
+          break;
+
+        case 1:
+        case 3:
+        case 5:
+        case 7:
+          common_vt_4t_8w_msa(src, src_stride, dst, dst_stride, v_filter + 1,
+                              4);
+          break;
+      }
+    }
+  } else {
+    switch (xoffset) {
+      case 0: vp8_copy_mem8x4(src, src_stride, dst, dst_stride); break;
+      case 2:
+      case 4:
+      case 6:
+        common_hz_6t_8w_msa(src, src_stride, dst, dst_stride, h_filter, 4);
+        break;
+
+      case 1:
+      case 3:
+      case 5:
+      case 7:
+        common_hz_4t_8w_msa(src, src_stride, dst, dst_stride, h_filter + 1, 4);
+        break;
+    }
+  }
+}
+
+void vp8_sixtap_predict8x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                               int32_t xoffset, int32_t yoffset,
+                               uint8_t *RESTRICT dst, int32_t dst_stride) {
+  /* Offset 0 means that axis is unfiltered; clamp so row -1 is never formed. */
+  const int8_t *h_filter = vp8_subpel_filters_msa[xoffset ? xoffset - 1 : 0];
+  const int8_t *v_filter = vp8_subpel_filters_msa[yoffset ? yoffset - 1 : 0];
+  if (yoffset) {
+    if (xoffset) {
+      switch (xoffset) {
+        case 2:
+        case 4:
+        case 6:
+          switch (yoffset) {
+            case 2:
+            case 4:
+            case 6:
+              common_hv_6ht_6vt_8w_msa(src, src_stride, dst, dst_stride,
+                                       h_filter, v_filter, 8);
+              break;
+            /* Odd yoffset: the "4vt" routine is handed v_filter from index 1. */
+            case 1:
+            case 3:
+            case 5:
+            case 7:
+              common_hv_6ht_4vt_8w_msa(src, src_stride, dst, dst_stride,
+                                       h_filter, v_filter + 1, 8);
+              break;
+          }
+          break;
+        /* Odd xoffset: the "4ht" routines are handed h_filter from index 1. */
+        case 1:
+        case 3:
+        case 5:
+        case 7:
+          switch (yoffset) {
+            case 2:
+            case 4:
+            case 6:
+              common_hv_4ht_6vt_8w_msa(src, src_stride, dst, dst_stride,
+                                       h_filter + 1, v_filter, 8);
+              break;
+
+            case 1:
+            case 3:
+            case 5:
+            case 7:
+              common_hv_4ht_4vt_8w_msa(src, src_stride, dst, dst_stride,
+                                       h_filter + 1, v_filter + 1, 8);
+              break;
+          }
+          break;
+      }
+    } else {
+      switch (yoffset) {
+        case 2:
+        case 4:
+        case 6:
+          common_vt_6t_8w_msa(src, src_stride, dst, dst_stride, v_filter, 8);
+          break;
+
+        case 1:
+        case 3:
+        case 5:
+        case 7:
+          common_vt_4t_8w_msa(src, src_stride, dst, dst_stride, v_filter + 1,
+                              8);
+          break;
+      }
+    }
+  } else {
+    switch (xoffset) {
+      case 0: vp8_copy_mem8x8(src, src_stride, dst, dst_stride); break;
+      case 2:
+      case 4:
+      case 6:
+        common_hz_6t_8w_msa(src, src_stride, dst, dst_stride, h_filter, 8);
+        break;
+
+      case 1:
+      case 3:
+      case 5:
+      case 7:
+        common_hz_4t_8w_msa(src, src_stride, dst, dst_stride, h_filter + 1, 8);
+        break;
+    }
+  }
+}
+
+void vp8_sixtap_predict16x16_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                 int32_t xoffset, int32_t yoffset,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride) {
+  /* Offset 0 means that axis is unfiltered; clamp so row -1 is never formed. */
+  const int8_t *h_filter = vp8_subpel_filters_msa[xoffset ? xoffset - 1 : 0];
+  const int8_t *v_filter = vp8_subpel_filters_msa[yoffset ? yoffset - 1 : 0];
+  if (yoffset) {
+    if (xoffset) {
+      switch (xoffset) {
+        case 2:
+        case 4:
+        case 6:
+          switch (yoffset) {
+            case 2:
+            case 4:
+            case 6:
+              common_hv_6ht_6vt_16w_msa(src, src_stride, dst, dst_stride,
+                                        h_filter, v_filter, 16);
+              break;
+            /* Odd yoffset: the "4vt" routine is handed v_filter from index 1. */
+            case 1:
+            case 3:
+            case 5:
+            case 7:
+              common_hv_6ht_4vt_16w_msa(src, src_stride, dst, dst_stride,
+                                        h_filter, v_filter + 1, 16);
+              break;
+          }
+          break;
+        /* Odd xoffset: the "4ht" routines are handed h_filter from index 1. */
+        case 1:
+        case 3:
+        case 5:
+        case 7:
+          switch (yoffset) {
+            case 2:
+            case 4:
+            case 6:
+              common_hv_4ht_6vt_16w_msa(src, src_stride, dst, dst_stride,
+                                        h_filter + 1, v_filter, 16);
+              break;
+
+            case 1:
+            case 3:
+            case 5:
+            case 7:
+              common_hv_4ht_4vt_16w_msa(src, src_stride, dst, dst_stride,
+                                        h_filter + 1, v_filter + 1, 16);
+              break;
+          }
+          break;
+      }
+    } else {
+      switch (yoffset) {
+        case 2:
+        case 4:
+        case 6:
+          common_vt_6t_16w_msa(src, src_stride, dst, dst_stride, v_filter, 16);
+          break;
+
+        case 1:
+        case 3:
+        case 5:
+        case 7:
+          common_vt_4t_16w_msa(src, src_stride, dst, dst_stride, v_filter + 1,
+                               16);
+          break;
+      }
+    }
+  } else {
+    switch (xoffset) {
+      case 0: vp8_copy_mem16x16(src, src_stride, dst, dst_stride); break;
+      case 2:
+      case 4:
+      case 6:
+        common_hz_6t_16w_msa(src, src_stride, dst, dst_stride, h_filter, 16);
+        break;
+
+      case 1:
+      case 3:
+      case 5:
+      case 7:
+        common_hz_4t_16w_msa(src, src_stride, dst, dst_stride, h_filter + 1,
+                             16);
+        break;
+    }
+  }
+}
diff --git a/media/libvpx/libvpx/vp8/common/mips/msa/vp8_macros_msa.h b/media/libvpx/libvpx/vp8/common/mips/msa/vp8_macros_msa.h
new file mode 100644
index 0000000000..7cb3c98690
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/mips/msa/vp8_macros_msa.h
@@ -0,0 +1,1762 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_
+#define VPX_VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_
+
+#include <msa.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+#define LD_B(RTYPE, psrc) (*((const RTYPE *)(psrc))) /* load one vector */
+#define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
+#define LD_SB(...) LD_B(v16i8, __VA_ARGS__)
+
+#define LD_H(RTYPE, psrc) (*((const RTYPE *)(psrc))) /* load one vector */
+#define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
+#define LD_SH(...) LD_H(v8i16, __VA_ARGS__)
+
+#define LD_W(RTYPE, psrc) (*((const RTYPE *)(psrc))) /* load one vector */
+#define LD_UW(...) LD_W(v4u32, __VA_ARGS__)
+#define LD_SW(...) LD_W(v4i32, __VA_ARGS__)
+
+#define ST_B(RTYPE, in, pdst) (*((RTYPE *)(pdst)) = (in)) /* store one vector */
+#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
+#define ST_SB(...) ST_B(v16i8, __VA_ARGS__)
+
+#define ST_H(RTYPE, in, pdst) (*((RTYPE *)(pdst)) = (in)) /* store one vector */
+#define ST_UH(...) ST_H(v8u16, __VA_ARGS__)
+#define ST_SH(...) ST_H(v8i16, __VA_ARGS__)
+
+#define ST_W(RTYPE, in, pdst) (*((RTYPE *)(pdst)) = (in)) /* store one vector */
+#define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
+
+#if (__mips_isa_rev >= 6)
+#define LW(psrc) /* 32-bit scalar load using a single "lw" */ \
+ ({ \
+ const uint8_t *psrc_m = (const uint8_t *)(psrc); \
+ uint32_t val_m; \
+ \
+ asm volatile("lw %[val_m], %[psrc_m] \n\t" \
+ \
+ : [val_m] "=r"(val_m) \
+ : [psrc_m] "m"(*psrc_m)); \
+ \
+ val_m; \
+ })
+
+#if (__mips == 64)
+#define LD(psrc) /* 64-bit scalar load using a single "ld" */ \
+ ({ \
+ const uint8_t *psrc_m = (const uint8_t *)(psrc); \
+ uint64_t val_m = 0; \
+ \
+ asm volatile("ld %[val_m], %[psrc_m] \n\t" \
+ \
+ : [val_m] "=r"(val_m) \
+ : [psrc_m] "m"(*psrc_m)); \
+ \
+ val_m; \
+ })
+#else // !(__mips == 64)
+#define LD(psrc) /* 64-bit load from two LW; word at psrc is the low half */ \
+ ({ \
+ const uint8_t *psrc_ld = (const uint8_t *)(psrc); \
+ uint32_t val0_m, val1_m; \
+ uint64_t val_m = 0; \
+ \
+ val0_m = LW(psrc_ld); \
+ val1_m = LW(psrc_ld + 4); \
+ \
+ val_m = (uint64_t)(val1_m); \
+ val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \
+ val_m = (uint64_t)(val_m | (uint64_t)val0_m); \
+ \
+ val_m; \
+ })
+#endif // (__mips == 64)
+
+#define SH(val, pdst) /* 16-bit scalar store using "sh" */ \
+ { \
+ uint8_t *pdst_m = (uint8_t *)(pdst); \
+ const uint16_t val_m = (val); \
+ \
+ asm volatile("sh %[val_m], %[pdst_m] \n\t" \
+ \
+ : [pdst_m] "=m"(*pdst_m) \
+ : [val_m] "r"(val_m)); \
+ }
+
+#define SW(val, pdst) /* 32-bit scalar store using "sw" */ \
+ { \
+ uint8_t *pdst_m = (uint8_t *)(pdst); \
+ const uint32_t val_m = (val); \
+ \
+ asm volatile("sw %[val_m], %[pdst_m] \n\t" \
+ \
+ : [pdst_m] "=m"(*pdst_m) \
+ : [val_m] "r"(val_m)); \
+ }
+
+#define SD(val, pdst) /* 64-bit scalar store using "sd" */ \
+ { \
+ uint8_t *pdst_m = (uint8_t *)(pdst); \
+ const uint64_t val_m = (val); \
+ \
+ asm volatile("sd %[val_m], %[pdst_m] \n\t" \
+ \
+ : [pdst_m] "=m"(*pdst_m) \
+ : [val_m] "r"(val_m)); \
+ }
+#else // !(__mips_isa_rev >= 6)
+#define LW(psrc) /* 32-bit scalar load using lwr (offset 0) + lwl (offset 3) */ \
+ ({ \
+ const uint8_t *psrc_m = (const uint8_t *)(psrc); \
+ uint32_t val_m; \
+ /* NOTE(review): 0/3 offsets look little-endian specific - confirm */ \
+ asm volatile( \
+ "lwr %[val_m], 0(%[psrc_m]) \n\t" \
+ "lwl %[val_m], 3(%[psrc_m]) \n\t" \
+ : [val_m] "=&r"(val_m) \
+ : [psrc_m] "r"(psrc_m)); \
+ \
+ val_m; \
+ })
+
+#if (__mips == 64)
+#define LD(psrc) /* 64-bit scalar load using ldr (offset 0) + ldl (offset 7) */ \
+ ({ \
+ const uint8_t *psrc_m = (const uint8_t *)(psrc); \
+ uint64_t val_m = 0; \
+ \
+ asm volatile( \
+ "ldr %[val_m], 0(%[psrc_m]) \n\t" \
+ "ldl %[val_m], 7(%[psrc_m]) \n\t" \
+ : [val_m] "=&r"(val_m) \
+ : [psrc_m] "r"(psrc_m)); \
+ \
+ val_m; \
+ })
+#else // !(__mips == 64)
+#define LD(psrc) /* 64-bit load from two LW; word at psrc is the low half */ \
+ ({ \
+ const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \
+ uint32_t val0_m, val1_m; \
+ uint64_t val_m = 0; \
+ \
+ val0_m = LW(psrc_m1); \
+ val1_m = LW(psrc_m1 + 4); \
+ \
+ val_m = (uint64_t)(val1_m); \
+ val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \
+ val_m = (uint64_t)(val_m | (uint64_t)val0_m); \
+ \
+ val_m; \
+ })
+#endif // (__mips == 64)
+#define SH(val, pdst) /* 16-bit scalar store using "ush" */ \
+ { \
+ uint8_t *pdst_m = (uint8_t *)(pdst); \
+ const uint16_t val_m = (val); \
+ \
+ asm volatile("ush %[val_m], %[pdst_m] \n\t" \
+ \
+ : [pdst_m] "=m"(*pdst_m) \
+ : [val_m] "r"(val_m)); \
+ }
+
+#define SW(val, pdst) /* 32-bit scalar store using "usw" */ \
+ { \
+ uint8_t *pdst_m = (uint8_t *)(pdst); \
+ const uint32_t val_m = (val); \
+ \
+ asm volatile("usw %[val_m], %[pdst_m] \n\t" \
+ \
+ : [pdst_m] "=m"(*pdst_m) \
+ : [val_m] "r"(val_m)); \
+ }
+
+#define SD(val, pdst) /* 64-bit store as two SW: low word, then high at +4 */ \
+ { \
+ uint8_t *pdst_m1 = (uint8_t *)(pdst); \
+ uint32_t val0_m, val1_m; \
+ \
+ val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \
+ val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
+ \
+ SW(val0_m, pdst_m1); \
+ SW(val1_m, pdst_m1 + 4); \
+ }
+#endif // (__mips_isa_rev >= 6)
+
+/* Description : Load 4 words with stride
+   Arguments   : Inputs  - psrc, stride
+                 Outputs - out0, out1, out2, out3
+   Details     : Load word in 'out0' from (psrc)
+                 Load word in 'out1' from (psrc + stride)
+                 Load word in 'out2' from (psrc + 2 * stride)
+                 Load word in 'out3' from (psrc + 3 * stride)
+   Note        : 'psrc' and 'stride' are expanded once per row */
+#define LW4(psrc, stride, out0, out1, out2, out3) \
+  { \
+    out0 = LW((psrc)); \
+    out1 = LW((psrc) + (stride)); \
+    out2 = LW((psrc) + 2 * (stride)); \
+    out3 = LW((psrc) + 3 * (stride)); \
+  }
+
+/* Description : Load double words with stride
+   Arguments   : Inputs  - psrc, stride
+                 Outputs - out0, out1
+   Details     : Load double word in 'out0' from (psrc)
+                 Load double word in 'out1' from (psrc + stride)
+   Note        : LD4 loads four rows by applying LD2 at rows 0 and 2 */
+#define LD2(psrc, stride, out0, out1) \
+  { \
+    out0 = LD((psrc)); \
+    out1 = LD((psrc) + (stride)); \
+  }
+#define LD4(psrc, stride, out0, out1, out2, out3) \
+  { \
+    LD2((psrc), (stride), out0, out1); \
+    LD2((psrc) + 2 * (stride), (stride), out2, out3); \
+  }
+
+/* Description : Store 4 words with stride
+   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
+   Details     : Store word from 'in0' to (pdst)
+                 Store word from 'in1' to (pdst + stride)
+                 Store word from 'in2' to (pdst + 2 * stride)
+                 Store word from 'in3' to (pdst + 3 * stride)
+   Note        : 'pdst' and 'stride' are expanded once per row */
+#define SW4(in0, in1, in2, in3, pdst, stride) \
+  { \
+    SW(in0, (pdst)); \
+    SW(in1, (pdst) + (stride)); \
+    SW(in2, (pdst) + 2 * (stride)); \
+    SW(in3, (pdst) + 3 * (stride)); \
+  }
+
+/* Description : Store 4 double words with stride
+   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
+   Details     : Store double word from 'in0' to (pdst)
+                 Store double word from 'in1' to (pdst + stride)
+                 Store double word from 'in2' to (pdst + 2 * stride)
+                 Store double word from 'in3' to (pdst + 3 * stride)
+   Note        : 'pdst' and 'stride' are expanded once per row */
+#define SD4(in0, in1, in2, in3, pdst, stride) \
+  { \
+    SD(in0, (pdst)); \
+    SD(in1, (pdst) + (stride)); \
+    SD(in2, (pdst) + 2 * (stride)); \
+    SD(in3, (pdst) + 3 * (stride)); \
+  }
+
+/* Description : Load vectors with 16 byte elements with stride
+   Arguments   : Inputs  - psrc, stride
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Load 16 byte elements in 'out0' from (psrc)
+                 Load 16 byte elements in 'out1' from (psrc + stride)
+   Note        : LD_B3/4/5/8 continue the same pattern for further rows */
+#define LD_B2(RTYPE, psrc, stride, out0, out1) \
+  { \
+    out0 = LD_B(RTYPE, (psrc)); \
+    out1 = LD_B(RTYPE, (psrc) + (stride)); \
+  }
+#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
+#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)
+
+#define LD_B3(RTYPE, psrc, stride, out0, out1, out2) \
+  { \
+    LD_B2(RTYPE, (psrc), (stride), out0, out1); \
+    out2 = LD_B(RTYPE, (psrc) + 2 * (stride)); \
+  }
+#define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__)
+#define LD_SB3(...) LD_B3(v16i8, __VA_ARGS__)
+
+#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \
+  { \
+    LD_B2(RTYPE, (psrc), (stride), out0, out1); \
+    LD_B2(RTYPE, (psrc) + 2 * (stride), (stride), out2, out3); \
+  }
+#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
+#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)
+
+#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \
+  { \
+    LD_B4(RTYPE, (psrc), (stride), out0, out1, out2, out3); \
+    out4 = LD_B(RTYPE, (psrc) + 4 * (stride)); \
+  }
+#define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__)
+#define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__)
+
+#define LD_B8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \
+              out7) \
+  { \
+    LD_B4(RTYPE, (psrc), (stride), out0, out1, out2, out3); \
+    LD_B4(RTYPE, (psrc) + 4 * (stride), (stride), out4, out5, out6, out7); \
+  }
+#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
+#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
+
+/* Description : Load vectors with 8 halfword elements with stride
+   Arguments   : Inputs  - psrc, stride
+                 Outputs - out0, out1
+   Details     : Load 8 halfword elements in 'out0' from (psrc)
+                 Load 8 halfword elements in 'out1' from (psrc + stride)
+   Note        : LD_H4 loads four rows by applying LD_H2 at rows 0 and 2 */
+#define LD_H2(RTYPE, psrc, stride, out0, out1) \
+  { \
+    out0 = LD_H(RTYPE, (psrc)); \
+    out1 = LD_H(RTYPE, (psrc) + (stride)); \
+  }
+#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)
+
+#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) \
+  { \
+    LD_H2(RTYPE, (psrc), (stride), out0, out1); \
+    LD_H2(RTYPE, (psrc) + 2 * (stride), (stride), out2, out3); \
+  }
+#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__)
+
+/* Description : Load 2 vectors of signed word elements with stride
+   Arguments   : Inputs  - psrc, stride
+                 Outputs - out0 from (psrc), out1 from (psrc + stride)
+                 Return Type - signed word
+*/
+#define LD_SW2(psrc, stride, out0, out1) \
+  { \
+    out0 = LD_SW((psrc)); \
+    out1 = LD_SW((psrc) + (stride)); \
+  }
+
+/* Description : Store vectors of 16 byte elements with stride
+   Arguments   : Inputs - in0, in1, pdst, stride
+   Details     : Store 16 byte elements from 'in0' to (pdst)
+                 Store 16 byte elements from 'in1' to (pdst + stride)
+   Note        : ST_B4/ST_B8 continue the same pattern for further rows */
+#define ST_B2(RTYPE, in0, in1, pdst, stride) \
+  { \
+    ST_B(RTYPE, in0, (pdst)); \
+    ST_B(RTYPE, in1, (pdst) + (stride)); \
+  }
+#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
+
+#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) \
+  { \
+    ST_B2(RTYPE, in0, in1, (pdst), (stride)); \
+    ST_B2(RTYPE, in2, in3, (pdst) + 2 * (stride), (stride)); \
+  }
+#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
+#define ST_SB4(...) ST_B4(v16i8, __VA_ARGS__)
+
+#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
+  { \
+    ST_B4(RTYPE, in0, in1, in2, in3, (pdst), (stride)); \
+    ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * (stride), (stride)); \
+  }
+#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)
+
+/* Description : Store vectors of 8 halfword elements with stride
+   Arguments   : Inputs - in0, in1, pdst, stride
+   Details     : Store 8 halfword elements from 'in0' to (pdst)
+                 Store 8 halfword elements from 'in1' to (pdst + stride)
+*/
+#define ST_H2(RTYPE, in0, in1, pdst, stride) \
+  { \
+    ST_H(RTYPE, in0, (pdst)); \
+    ST_H(RTYPE, in1, (pdst) + (stride)); \
+  }
+#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)
+
+/* Description : Store vectors of word elements with stride
+   Arguments   : Inputs - in0, in1, pdst, stride
+   Details     : Store 4 word elements from 'in0' to (pdst)
+                 Store 4 word elements from 'in1' to (pdst + stride)
+*/
+#define ST_SW2(in0, in1, pdst, stride) \
+  { \
+    ST_SW(in0, (pdst)); \
+    ST_SW(in1, (pdst) + (stride)); \
+  }
+
+/* Description : Store 2x4 byte block to destination memory from input vector
+   Arguments   : Inputs - in, stidx, pdst, stride
+   Details     : Index 'stidx' halfword element from 'in' vector is copied to
+                 the GP register and stored to (pdst)
+                 Index 'stidx+1' halfword element from 'in' vector is copied to
+                 the GP register and stored to (pdst + stride)
+                 Index 'stidx+2' halfword element from 'in' vector is copied to
+                 the GP register and stored to (pdst + 2 * stride)
+                 Index 'stidx+3' halfword element from 'in' vector is copied to
+                 the GP register and stored to (pdst + 3 * stride)
+*/
+#define ST2x4_UB(in, stidx, pdst, stride) \
+  { \
+    uint16_t out0_m, out1_m, out2_m, out3_m; \
+    uint8_t *pblk_2x4_m = (uint8_t *)(pdst); \
+    \
+    out0_m = __msa_copy_u_h((v8i16)(in), (stidx)); \
+    out1_m = __msa_copy_u_h((v8i16)(in), ((stidx) + 1)); \
+    out2_m = __msa_copy_u_h((v8i16)(in), ((stidx) + 2)); \
+    out3_m = __msa_copy_u_h((v8i16)(in), ((stidx) + 3)); \
+    \
+    SH(out0_m, pblk_2x4_m); \
+    SH(out1_m, pblk_2x4_m + (stride)); \
+    SH(out2_m, pblk_2x4_m + 2 * (stride)); \
+    SH(out3_m, pblk_2x4_m + 3 * (stride)); \
+  }
+
+/* Description : Store 4x4 byte block to destination memory from input vectors
+   Arguments   : Inputs - in0, in1, idx0, idx1, idx2, idx3, pdst, stride
+   Details     : Word element 'idx0' of input vector 'in0' is copied to the
+                 GP register and stored to (pdst)
+                 Word element 'idx1' of input vector 'in0' is copied to the
+                 GP register and stored to (pdst + stride)
+                 Word element 'idx2' of input vector 'in1' is copied to the
+                 GP register and stored to (pdst + 2 * stride)
+                 Word element 'idx3' of input vector 'in1' is copied to the
+                 GP register and stored to (pdst + 3 * stride)
+*/
+#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) \
+  { \
+    uint32_t out0_m, out1_m, out2_m, out3_m; \
+    uint8_t *pblk_4x4_m = (uint8_t *)(pdst); \
+    \
+    out0_m = __msa_copy_u_w((v4i32)(in0), (idx0)); \
+    out1_m = __msa_copy_u_w((v4i32)(in0), (idx1)); \
+    out2_m = __msa_copy_u_w((v4i32)(in1), (idx2)); \
+    out3_m = __msa_copy_u_w((v4i32)(in1), (idx3)); \
+    \
+    SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, (stride)); \
+  }
+#define ST4x8_UB(in0, in1, pdst, stride) \
+  { \
+    uint8_t *pblk_4x8 = (uint8_t *)(pdst); \
+    \
+    ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, (stride)); \
+    ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * (stride), (stride)); \
+  }
+
+/* Description : Store 8x1 byte block to destination memory from input vector
+   Arguments   : Inputs - in, pdst
+   Details     : Index 0 double word element from 'in' vector is copied to the
+                 GP register and stored to (pdst)
+*/
+#define ST8x1_UB(in, pdst) \
+  { \
+    uint64_t out0_m; \
+    \
+    out0_m = __msa_copy_u_d((v2i64)(in), 0); \
+    SD(out0_m, (pdst)); \
+  }
+
+/* Description : Store 8x2 byte block to destination memory from input vector
+   Arguments   : Inputs - in, pdst, stride
+   Details     : Index 0 double word element from 'in' vector is copied to the
+                 GP register and stored to (pdst)
+                 Index 1 double word element from 'in' vector is copied to the
+                 GP register and stored to (pdst + stride)
+*/
+#define ST8x2_UB(in, pdst, stride) \
+  { \
+    uint64_t out0_m, out1_m; \
+    uint8_t *pblk_8x2_m = (uint8_t *)(pdst); \
+    \
+    out0_m = __msa_copy_u_d((v2i64)(in), 0); \
+    out1_m = __msa_copy_u_d((v2i64)(in), 1); \
+    \
+    SD(out0_m, pblk_8x2_m); \
+    SD(out1_m, pblk_8x2_m + (stride)); \
+  }
+
+/* Description : Store 8x4 byte block to destination memory from input
+                 vectors
+   Arguments   : Inputs - in0, in1, pdst, stride
+   Details     : Index 0 double word element from 'in0' vector is copied to the
+                 GP register and stored to (pdst)
+                 Index 1 double word element from 'in0' vector is copied to the
+                 GP register and stored to (pdst + stride)
+                 Index 0 double word element from 'in1' vector is copied to the
+                 GP register and stored to (pdst + 2 * stride)
+                 Index 1 double word element from 'in1' vector is copied to the
+                 GP register and stored to (pdst + 3 * stride)
+*/
+#define ST8x4_UB(in0, in1, pdst, stride) \
+  { \
+    uint64_t out0_m, out1_m, out2_m, out3_m; \
+    uint8_t *pblk_8x4_m = (uint8_t *)(pdst); \
+    \
+    out0_m = __msa_copy_u_d((v2i64)(in0), 0); \
+    out1_m = __msa_copy_u_d((v2i64)(in0), 1); \
+    out2_m = __msa_copy_u_d((v2i64)(in1), 0); \
+    out3_m = __msa_copy_u_d((v2i64)(in1), 1); \
+    \
+    SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, (stride)); \
+  }
+
+/* Description : Immediate number of elements to slide with zero
+   Arguments   : Inputs  - in0, in1, slide_val
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Byte elements from 'zero_m' vector are slid into 'in0' by
+                 value specified in the 'slide_val'
+   Note        : 'out1' is produced the same way from 'in1' */
+#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) \
+  { \
+    v16i8 zero_m = { 0 }; \
+    \
+    out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)(in0), (slide_val)); \
+    out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)(in1), (slide_val)); \
+  }
+#define SLDI_B2_0_UB(...) SLDI_B2_0(v16u8, __VA_ARGS__)
+
+/* Description : Immediate number of elements to slide
+   Arguments   : Inputs  - in0_0, in0_1, in1_0, in1_1, slide_val
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Byte elements from 'in0_0' vector are slid into 'in1_0' by
+                 value specified in the 'slide_val'
+   Note        : 'out1' uses 'in0_1'/'in1_1'; SLDI_B3 adds 'in0_2'/'in1_2' */
+#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \
+  { \
+    out0 = (RTYPE)__msa_sldi_b((v16i8)(in0_0), (v16i8)(in1_0), (slide_val)); \
+    out1 = (RTYPE)__msa_sldi_b((v16i8)(in0_1), (v16i8)(in1_1), (slide_val)); \
+  }
+
+#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, out0, out1, \
+                out2, slide_val) \
+  { \
+    SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val); \
+    out2 = (RTYPE)__msa_sldi_b((v16i8)(in0_2), (v16i8)(in1_2), (slide_val)); \
+  }
+#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)
+
+/* Description : Shuffle byte vector elements as per mask vector
+   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Byte elements from 'in0' & 'in1' are copied selectively to
+                 'out0' as per control vector 'mask0'
+   Note        : 'out1' is built from 'in2' & 'in3' under 'mask1' */
+#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \
+  { \
+    out0 = (RTYPE)__msa_vshf_b((v16i8)(mask0), (v16i8)(in1), (v16i8)(in0)); \
+    out1 = (RTYPE)__msa_vshf_b((v16i8)(mask1), (v16i8)(in3), (v16i8)(in2)); \
+  }
+#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
+#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
+#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
+
+#define VSHF_B3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2, \
+                out0, out1, out2) \
+  { \
+    VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1); \
+    out2 = (RTYPE)__msa_vshf_b((v16i8)(mask2), (v16i8)(in5), (v16i8)(in4)); \
+  }
+#define VSHF_B3_SB(...) VSHF_B3(v16i8, __VA_ARGS__)
+
+/* Description : Shuffle halfword vector elements as per mask vector
+   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : halfword elements from 'in0' & 'in1' are copied selectively to
+                 'out0' as per control vector 'mask0'
+   Note        : 'out1' is built from 'in2' & 'in3' under 'mask1' */
+#define VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \
+  { \
+    out0 = (RTYPE)__msa_vshf_h((v8i16)(mask0), (v8i16)(in1), (v8i16)(in0)); \
+    out1 = (RTYPE)__msa_vshf_h((v8i16)(mask1), (v8i16)(in3), (v8i16)(in2)); \
+  }
+#define VSHF_H2_SH(...) VSHF_H2(v8i16, __VA_ARGS__)
+
+/* Description : Dot product of byte vector elements
+   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Unsigned byte elements from 'mult0' are multiplied with
+                 unsigned byte elements from 'cnst0' producing a result
+                 twice the size of input i.e. unsigned halfword.
+                 The multiplication result of adjacent odd-even elements
+                 are added together and written to the 'out0' vector
+   Note        : 'out1' is computed the same way from 'mult1' and 'cnst1' */
+#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
+  { \
+    out0 = (RTYPE)__msa_dotp_u_h((v16u8)(mult0), (v16u8)(cnst0)); \
+    out1 = (RTYPE)__msa_dotp_u_h((v16u8)(mult1), (v16u8)(cnst1)); \
+  }
+#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)
+
+#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
+                 cnst3, out0, out1, out2, out3) \
+  { \
+    DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
+    DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
+  }
+#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)
+
+/* Description : Dot product of byte vector elements
+   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Signed byte elements from 'mult0' are multiplied with
+                 signed byte elements from 'cnst0' producing a result
+                 twice the size of input i.e. signed halfword.
+                 The multiplication result of adjacent odd-even elements
+                 are added together and written to the 'out0' vector
+   Note        : 'out1' is computed the same way from 'mult1' and 'cnst1' */
+#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
+  { \
+    out0 = (RTYPE)__msa_dotp_s_h((v16i8)(mult0), (v16i8)(cnst0)); \
+    out1 = (RTYPE)__msa_dotp_s_h((v16i8)(mult1), (v16i8)(cnst1)); \
+  }
+#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)
+
+#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
+                 cnst3, out0, out1, out2, out3) \
+  { \
+    DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
+    DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
+  }
+#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)
+
+/* Description : Dot product of halfword vector elements
+   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Signed halfword elements from 'mult0' are multiplied with
+                 signed halfword elements from 'cnst0' producing a result
+                 twice the size of input i.e. signed word.
+                 The multiplication result of adjacent odd-even elements
+                 are added together and written to the 'out0' vector
+   Note        : 'out1' is computed the same way from 'mult1' and 'cnst1' */
+#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
+  { \
+    out0 = (RTYPE)__msa_dotp_s_w((v8i16)(mult0), (v8i16)(cnst0)); \
+    out1 = (RTYPE)__msa_dotp_s_w((v8i16)(mult1), (v8i16)(cnst1)); \
+  }
+
+#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
+                 cnst3, out0, out1, out2, out3) \
+  { \
+    DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
+    DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
+  }
+#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
+
+/* Description : Dot product of word vector elements
+   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Signed word elements from 'mult0' are multiplied with
+                 signed word elements from 'cnst0' producing a result
+                 twice the size of input i.e. signed double word.
+                 The multiplication result of adjacent odd-even elements
+                 are added together and written to the 'out0' vector
+   Note        : 'out1' is computed the same way from 'mult1' and 'cnst1' */
+#define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
+  { \
+    out0 = (RTYPE)__msa_dotp_s_d((v4i32)(mult0), (v4i32)(cnst0)); \
+    out1 = (RTYPE)__msa_dotp_s_d((v4i32)(mult1), (v4i32)(cnst1)); \
+  }
+#define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__)
+
+/* Description : Dot product & addition of byte vector elements
+   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
+                 In/Out  - out0, out1 (read as accumulators, then overwritten)
+                 Return Type - as per RTYPE
+   Details     : Signed byte elements from 'mult0' are multiplied with
+                 signed byte elements from 'cnst0' producing a result
+                 twice the size of input i.e. signed halfword.
+                 The multiplication result of adjacent odd-even elements
+                 are added to the 'out0' vector
+   Note        : 'out1' accumulates 'mult1' x 'cnst1' the same way */
+#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
+  { \
+    out0 = (RTYPE)__msa_dpadd_s_h((v8i16)(out0), (v16i8)(mult0), (v16i8)(cnst0)); \
+    out1 = (RTYPE)__msa_dpadd_s_h((v8i16)(out1), (v16i8)(mult1), (v16i8)(cnst1)); \
+  }
+#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)
+
+#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
+                  cnst3, out0, out1, out2, out3) \
+  { \
+    DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
+    DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
+  }
+#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)
+
+/* Description : Dot product & addition of halfword vector elements
+   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
+                 In/Out  - out0, out1 (read as accumulators, then overwritten)
+                 Return Type - as per RTYPE
+   Details     : Signed halfword elements from 'mult0' are multiplied with
+                 signed halfword elements from 'cnst0' producing a result
+                 twice the size of input i.e. signed word.
+                 The multiplication result of adjacent odd-even elements
+                 are added to the 'out0' vector
+   Note        : 'out1' accumulates 'mult1' x 'cnst1' the same way */
+#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
+  { \
+    out0 = (RTYPE)__msa_dpadd_s_w((v4i32)(out0), (v8i16)(mult0), (v8i16)(cnst0)); \
+    out1 = (RTYPE)__msa_dpadd_s_w((v4i32)(out1), (v8i16)(mult1), (v8i16)(cnst1)); \
+  }
+#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)
+
+#define DPADD_SH4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
+                  cnst3, out0, out1, out2, out3) \
+  { \
+    DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
+    DPADD_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
+  }
+#define DPADD_SH4_SW(...) DPADD_SH4(v4i32, __VA_ARGS__)
+
+/* Description : Dot product & addition of double word vector elements
+   Arguments   : Inputs  - mult0, mult1
+                 In/Out  - out0, out1 (read as accumulators, then overwritten)
+                 Return Type - as per RTYPE
+   Details     : Each signed word element from 'mult0' is multiplied with itself
+                 producing an intermediate result twice the size of it
+                 i.e. signed double word
+                 The multiplication result of adjacent odd-even elements
+                 are added to the 'out0' vector
+   Note        : 'out1' accumulates the squares of 'mult1' the same way */
+#define DPADD_SD2(RTYPE, mult0, mult1, out0, out1) \
+  { \
+    out0 = (RTYPE)__msa_dpadd_s_d((v2i64)(out0), (v4i32)(mult0), (v4i32)(mult0)); \
+    out1 = (RTYPE)__msa_dpadd_s_d((v2i64)(out1), (v4i32)(mult1), (v4i32)(mult1)); \
+  }
+#define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__)
+
+/* Description : Clips all signed halfword elements of input vector
+                 between 0 & 255
+   Arguments   : Input  - in
+                 Output - out_m
+                 Return Type - signed halfword
+   Note        : CLIP_SH2/CLIP_SH4 clip their arguments in place */
+#define CLIP_SH_0_255(in) \
+  ({ \
+    v8i16 max_m = __msa_ldi_h(255); \
+    v8i16 out_m; \
+    \
+    out_m = __msa_maxi_s_h((v8i16)(in), 0); \
+    out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m); \
+    out_m; \
+  })
+#define CLIP_SH2_0_255(in0, in1) \
+  { \
+    in0 = CLIP_SH_0_255(in0); \
+    in1 = CLIP_SH_0_255(in1); \
+  }
+#define CLIP_SH4_0_255(in0, in1, in2, in3) \
+  { \
+    CLIP_SH2_0_255(in0, in1); \
+    CLIP_SH2_0_255(in2, in3); \
+  }
+
+/* Description : Clips all signed word elements of input vector
+ between 0 & 255
+ Arguments : Input - in
+ Output - out_m
+ Return Type - signed word
+*/
+#define CLIP_SW_0_255(in) \
+ ({ \
+ v4i32 max_m = __msa_ldi_w(255); \
+ v4i32 out_m; \
+ \
+ out_m = __msa_maxi_s_w((v4i32)in, 0); \
+ out_m = __msa_min_s_w((v4i32)max_m, (v4i32)out_m); \
+ out_m; \
+ })
+
+/* Description : Horizontal addition of 4 signed word elements of input vector
+ Arguments : Input - in (signed word vector)
+ Output - sum_m (i32 sum)
+ Return Type - signed word (GP)
+ Details : 4 signed word elements of 'in' vector are added together and
+ the resulting integer sum is returned
+*/
+#define HADD_SW_S32(in) \
+ ({ \
+ v2i64 res0_m, res1_m; \
+ int32_t sum_m; \
+ \
+ res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in); \
+ res1_m = __msa_splati_d(res0_m, 1); \
+ res0_m = res0_m + res1_m; \
+ sum_m = __msa_copy_s_w((v4i32)res0_m, 0); \
+ sum_m; \
+ })
+
+/* Description : Horizontal addition of 8 unsigned halfword elements
+ Arguments : Inputs - in (unsigned halfword vector)
+ Outputs - sum_m (u32 sum)
+ Return Type - unsigned word
+ Details : 8 unsigned halfword elements of input vector are added
+ together and the resulting integer sum is returned
+*/
+#define HADD_UH_U32(in) \
+ ({ \
+ v4u32 res_m; \
+ v2u64 res0_m, res1_m; \
+ uint32_t sum_m; \
+ \
+ res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \
+ res0_m = __msa_hadd_u_d(res_m, res_m); \
+ res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1); \
+ res0_m = res0_m + res1_m; \
+ sum_m = __msa_copy_u_w((v4i32)res0_m, 0); \
+ sum_m; \
+ })
+
+/* Description : Horizontal addition of unsigned byte vector elements
+ Arguments : Inputs - in0, in1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Each unsigned odd byte element from 'in0' is added to
+ even unsigned byte element from 'in0' (pairwise) and the
+ halfword result is written to 'out0'
+*/
+#define HADD_UB2(RTYPE, in0, in1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_hadd_u_h((v16u8)in0, (v16u8)in0); \
+ out1 = (RTYPE)__msa_hadd_u_h((v16u8)in1, (v16u8)in1); \
+ }
+#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)
+
+/* Description : Horizontal subtraction of unsigned byte vector elements
+ Arguments : Inputs - in0, in1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Each unsigned odd byte element from 'in0' is subtracted from
+ even unsigned byte element from 'in0' (pairwise) and the
+ halfword result is written to 'out0'
+*/
+#define HSUB_UB2(RTYPE, in0, in1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_hsub_u_h((v16u8)in0, (v16u8)in0); \
+ out1 = (RTYPE)__msa_hsub_u_h((v16u8)in1, (v16u8)in1); \
+ }
+#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
+
+/* Description : Horizontal subtraction of signed halfword vector elements
+ Arguments : Inputs - in0, in1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Each signed odd halfword element from 'in0' is subtracted from
+ even signed halfword element from 'in0' (pairwise) and the
+ word result is written to 'out0'
+*/
+#define HSUB_UH2(RTYPE, in0, in1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_hsub_s_w((v8i16)in0, (v8i16)in0); \
+ out1 = (RTYPE)__msa_hsub_s_w((v8i16)in1, (v8i16)in1); \
+ }
+#define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__)
+
+/* Description : Set double word elements 0 and 1 of vector to GPR values
+ Arguments : Inputs - in0, in1
+ Output - out
+ Return Type - as per RTYPE
+ Details : Elements 0 and 1 of 'out' are set to 'in0' and 'in1' respectively
+*/
+#define INSERT_D2(RTYPE, in0, in1, out) \
+ { \
+ out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0); \
+ out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1); \
+ }
+#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
+
+/* Description : Interleave even byte elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Even byte elements of 'in0' and 'in1' are interleaved
+ and written to 'out0'
+*/
+#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \
+ out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \
+ }
+#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
+#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
+#define ILVEV_B2_SD(...) ILVEV_B2(v2i64, __VA_ARGS__)
+
+/* Description : Interleave even halfword elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Even halfword elements of 'in0' and 'in1' are interleaved
+ and written to 'out0'
+*/
+#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0); \
+ out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2); \
+ }
+#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
+#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
+
+/* Description : Interleave even word elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Even word elements of 'in0' and 'in1' are interleaved
+ and written to 'out0'
+*/
+#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0); \
+ out1 = (RTYPE)__msa_ilvev_w((v4i32)in3, (v4i32)in2); \
+ }
+#define ILVEV_W2_SD(...) ILVEV_W2(v2i64, __VA_ARGS__)
+
+/* Description : Interleave even double word elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Even double word elements of 'in0' and 'in1' are interleaved
+ and written to 'out0'
+*/
+#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0); \
+ out1 = (RTYPE)__msa_ilvev_d((v2i64)in3, (v2i64)in2); \
+ }
+#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
+
+/* Description : Interleave left half of byte elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Left half of byte elements of 'in0' and 'in1' are interleaved
+ and written to 'out0'.
+*/
+#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
+ out1 = (RTYPE)__msa_ilvl_b((v16i8)in2, (v16i8)in3); \
+ }
+#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
+#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
+#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)
+
+#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3) \
+ { \
+ ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
+ }
+#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
+#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)
+
+/* Description : Interleave left half of halfword elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Left half of halfword elements of 'in0' and 'in1' are
+ interleaved and written to 'out0'.
+*/
+#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \
+ out1 = (RTYPE)__msa_ilvl_h((v8i16)in2, (v8i16)in3); \
+ }
+#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
+#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)
+
+/* Description : Interleave left half of word elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Left half of word elements of 'in0' and 'in1' are interleaved
+ and written to 'out0'.
+*/
+#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \
+ out1 = (RTYPE)__msa_ilvl_w((v4i32)in2, (v4i32)in3); \
+ }
+#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)
+
+/* Description : Interleave right half of byte elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Right half of byte elements of 'in0' and 'in1' are interleaved
+ and written to out0.
+*/
+#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
+ out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3); \
+ }
+#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
+#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
+#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
+#define ILVR_B2_SW(...) ILVR_B2(v4i32, __VA_ARGS__)
+
+#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3) \
+ { \
+ ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
+ }
+#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
+#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
+#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
+#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
+#define ILVR_B4_SW(...) ILVR_B4(v4i32, __VA_ARGS__)
+
+/* Description : Interleave right half of halfword elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Right half of halfword elements of 'in0' and 'in1' are
+ interleaved and written to 'out0'.
+*/
+#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \
+ out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3); \
+ }
+#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
+#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)
+
+#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3) \
+ { \
+ ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
+ }
+#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
+#define ILVR_H4_SW(...) ILVR_H4(v4i32, __VA_ARGS__)
+
+#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \
+ out1 = (RTYPE)__msa_ilvr_w((v4i32)in2, (v4i32)in3); \
+ }
+#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)
+
+/* Description : Interleave right half of double word elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Right half of double word elements of 'in0' and 'in1' are
+ interleaved and written to 'out0'.
+*/
+#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1)); \
+ out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3)); \
+ }
+#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
+#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
+#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)
+
+#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3) \
+ { \
+ ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3); \
+ }
+#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
+#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
+
+/* Description : Interleave both left and right half of input vectors
+ Arguments : Inputs - in0, in1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Right half of byte elements from 'in0' and 'in1' are
+ interleaved into 'out0'; left half elements into 'out1'
+*/
+#define ILVRL_B2(RTYPE, in0, in1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
+ out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
+ }
+#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
+#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
+#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
+#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
+
+#define ILVRL_H2(RTYPE, in0, in1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \
+ out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \
+ }
+#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
+#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
+
+#define ILVRL_W2(RTYPE, in0, in1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \
+ out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \
+ }
+#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
+#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
+#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
+
+/* Description : Maximum values between signed elements of vector and
+ 5-bit signed immediate value are copied to the output vector
+ Arguments : Inputs - in0, in1, max_val
+ Outputs - in place operation
+ Return Type - as per RTYPE
+ Details : Maximum of signed halfword element values from 'in0' and
+ 'max_val' are written in place
+*/
+#define MAXI_SH2(RTYPE, in0, in1, max_val) \
+ { \
+ in0 = (RTYPE)__msa_maxi_s_h((v8i16)in0, (max_val)); \
+ in1 = (RTYPE)__msa_maxi_s_h((v8i16)in1, (max_val)); \
+ }
+#define MAXI_SH2_SH(...) MAXI_SH2(v8i16, __VA_ARGS__)
+
+/* Description : Saturate the halfword element values to the max
+ unsigned value of (sat_val + 1) bits
+ The element data width remains unchanged
+ Arguments : Inputs - in0, in1, sat_val
+ Outputs - in place operation
+ Return Type - as per RTYPE
+ Details : Each unsigned halfword element from 'in0' is saturated to the
+ value generated with (sat_val + 1) bit range.
+ The results are written in place
+*/
+#define SAT_UH2(RTYPE, in0, in1, sat_val) \
+ { \
+ in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val); \
+ in1 = (RTYPE)__msa_sat_u_h((v8u16)in1, sat_val); \
+ }
+#define SAT_UH2_SH(...) SAT_UH2(v8i16, __VA_ARGS__)
+
+/* Description : Saturate the halfword element values to the signed
+ range of (sat_val + 1) bits
+ The element data width remains unchanged
+ Arguments : Inputs - in0, in1, sat_val
+ Outputs - in place operation
+ Return Type - as per RTYPE
+ Details : Each signed halfword element from 'in0' is saturated to the
+ value generated with (sat_val + 1) bit range
+ The results are written in place
+*/
+#define SAT_SH2(RTYPE, in0, in1, sat_val) \
+ { \
+ in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val); \
+ in1 = (RTYPE)__msa_sat_s_h((v8i16)in1, sat_val); \
+ }
+#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)
+
+#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) \
+ { \
+ SAT_SH2(RTYPE, in0, in1, sat_val); \
+ SAT_SH2(RTYPE, in2, in3, sat_val); \
+ }
+#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
+
+/* Description : Indexed halfword element values are replicated to all
+ elements in output vector
+ Arguments : Inputs - in, idx0, idx1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : 'idx0' element value from 'in' vector is replicated to all
+ elements in 'out0' vector
+ Valid index range for halfword operation is 0-7
+*/
+#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_splati_h((v8i16)in, idx0); \
+ out1 = (RTYPE)__msa_splati_h((v8i16)in, idx1); \
+ }
+#define SPLATI_H2_SB(...) SPLATI_H2(v16i8, __VA_ARGS__)
+#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)
+
+#define SPLATI_H3(RTYPE, in, idx0, idx1, idx2, out0, out1, out2) \
+ { \
+ SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \
+ out2 = (RTYPE)__msa_splati_h((v8i16)in, idx2); \
+ }
+#define SPLATI_H3_SB(...) SPLATI_H3(v16i8, __VA_ARGS__)
+#define SPLATI_H3_SH(...) SPLATI_H3(v8i16, __VA_ARGS__)
+
+/* Description : Indexed word element values are replicated to all
+ elements in output vector
+ Arguments : Inputs - in, stidx
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : 'stidx' element value from 'in' vector is replicated to all
+ elements in 'out0' vector
+ 'stidx + 1' element value from 'in' vector is replicated to all
+ elements in 'out1' vector
+ Valid index range for word operation is 0-3
+ 'stidx' is parenthesized so that an expression argument
+ keeps its own precedence when 1 is added to it
+*/
+#define SPLATI_W2(RTYPE, in, stidx, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_splati_w((v4i32)in, (stidx)); \
+ out1 = (RTYPE)__msa_splati_w((v4i32)in, ((stidx) + 1)); \
+ }
+#define SPLATI_W2_SW(...) SPLATI_W2(v4i32, __VA_ARGS__)
+
+/* Description : Pack even byte elements of vector pairs
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Even byte elements of 'in0' are copied to the left half of
+ 'out0' & even byte elements of 'in1' are copied to the right
+ half of 'out0'.
+*/
+#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1); \
+ out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3); \
+ }
+#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
+#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
+#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
+#define PCKEV_B2_SW(...) PCKEV_B2(v4i32, __VA_ARGS__)
+
+#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3) \
+ { \
+ PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
+ }
+#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
+#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
+#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
+
+/* Description : Pack even halfword elements of vector pairs
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Even halfword elements of 'in0' are copied to the left half of
+ 'out0' & even halfword elements of 'in1' are copied to the
+ right half of 'out0'.
+*/
+#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1); \
+ out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3); \
+ }
+#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
+
+#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3) \
+ { \
+ PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
+ }
+#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
+
+/* Description : Pack even double word elements of vector pairs
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Even double elements of 'in0' are copied to the left half of
+ 'out0' & even double elements of 'in1' are copied to the right
+ half of 'out0'.
+*/
+#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_pckev_d((v2i64)in0, (v2i64)in1); \
+ out1 = (RTYPE)__msa_pckev_d((v2i64)in2, (v2i64)in3); \
+ }
+#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
+#define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)
+
+/* Description : Pack odd double word elements of vector pairs
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Odd double word elements of 'in0' are copied to the left half
+ of 'out0' & odd double word elements of 'in1' are copied to
+ the right half of 'out0'.
+*/
+#define PCKOD_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_pckod_d((v2i64)in0, (v2i64)in1); \
+ out1 = (RTYPE)__msa_pckod_d((v2i64)in2, (v2i64)in3); \
+ }
+#define PCKOD_D2_UB(...) PCKOD_D2(v16u8, __VA_ARGS__)
+#define PCKOD_D2_SH(...) PCKOD_D2(v8i16, __VA_ARGS__)
+
+/* Description : Each byte element is logically xor'ed with immediate 128
+ Arguments : Inputs - in0, in1
+ Outputs - in place operation
+ Return Type - as per RTYPE
+ Details : Each unsigned byte element from input vector 'in0' is
+ logically xor'ed with 128 and the result is stored in-place.
+*/
+#define XORI_B2_128(RTYPE, in0, in1) \
+ { \
+ in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128); \
+ in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128); \
+ }
+#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
+#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)
+
+#define XORI_B3_128(RTYPE, in0, in1, in2) \
+ { \
+ XORI_B2_128(RTYPE, in0, in1); \
+ in2 = (RTYPE)__msa_xori_b((v16u8)in2, 128); \
+ }
+#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)
+
+#define XORI_B4_128(RTYPE, in0, in1, in2, in3) \
+ { \
+ XORI_B2_128(RTYPE, in0, in1); \
+ XORI_B2_128(RTYPE, in2, in3); \
+ }
+#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
+#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)
+
+#define XORI_B5_128(RTYPE, in0, in1, in2, in3, in4) \
+ { \
+ XORI_B3_128(RTYPE, in0, in1, in2); \
+ XORI_B2_128(RTYPE, in3, in4); \
+ }
+#define XORI_B5_128_SB(...) XORI_B5_128(v16i8, __VA_ARGS__)
+
+#define XORI_B8_128(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7) \
+ { \
+ XORI_B4_128(RTYPE, in0, in1, in2, in3); \
+ XORI_B4_128(RTYPE, in4, in5, in6, in7); \
+ }
+#define XORI_B8_128_SB(...) XORI_B8_128(v16i8, __VA_ARGS__)
+
+/* Description : Shift left all elements of vector (generic for all data types)
+ Arguments : Inputs - in0, in1, in2, in3, shift
+ Outputs - in place operation
+ Return Type - as per input vector RTYPE
+ Details : Each element of vector 'in0' is left shifted by 'shift' and
+ the result is written in-place.
+ Arguments are parenthesized in the expansion so that an
+ expression passed as 'shift' is evaluated as a whole.
+*/
+#define SLLI_4V(in0, in1, in2, in3, shift) \
+ { \
+ in0 = (in0) << (shift); \
+ in1 = (in1) << (shift); \
+ in2 = (in2) << (shift); \
+ in3 = (in3) << (shift); \
+ }
+
+/* Description : Arithmetic shift right all elements of vector
+ (generic for all data types)
+ Arguments : Inputs - in0, in1, in2, in3, shift
+ Outputs - in place operation
+ Return Type - as per input vector RTYPE
+ Details : Each element of vector 'in0' is right shifted by 'shift' and
+ the result is written in-place. 'shift' is a GP variable.
+ Arguments are parenthesized in the expansion so that an
+ expression passed as 'shift' is evaluated as a whole.
+*/
+#define SRA_4V(in0, in1, in2, in3, shift) \
+ { \
+ in0 = (in0) >> (shift); \
+ in1 = (in1) >> (shift); \
+ in2 = (in2) >> (shift); \
+ in3 = (in3) >> (shift); \
+ }
+
+/* Description : Shift right arithmetic rounded words
+ Arguments : Inputs - in0, in1, shift
+ Outputs - in place operation
+ Return Type - as per RTYPE
+ Details : Each element of vector 'in0' is shifted right arithmetically by
+ the number of bits in the corresponding element in the vector
+ 'shift'. The last discarded bit is added to shifted value for
+ rounding and the result is written in-place.
+ 'shift' is a vector.
+*/
+#define SRAR_W2(RTYPE, in0, in1, shift) \
+ { \
+ in0 = (RTYPE)__msa_srar_w((v4i32)in0, (v4i32)shift); \
+ in1 = (RTYPE)__msa_srar_w((v4i32)in1, (v4i32)shift); \
+ }
+
+#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) \
+ { \
+ SRAR_W2(RTYPE, in0, in1, shift); \
+ SRAR_W2(RTYPE, in2, in3, shift); \
+ }
+#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
+
+/* Description : Shift right arithmetic rounded (immediate)
+ Arguments : Inputs - in0, in1, shift
+ Outputs - in place operation
+ Return Type - as per RTYPE
+ Details : Each element of vector 'in0' is shifted right arithmetically by
+ the value in 'shift'. The last discarded bit is added to the
+ shifted value for rounding and the result is written in-place.
+ 'shift' is an immediate value.
+*/
+#define SRARI_H2(RTYPE, in0, in1, shift) \
+ { \
+ in0 = (RTYPE)__msa_srari_h((v8i16)in0, shift); \
+ in1 = (RTYPE)__msa_srari_h((v8i16)in1, shift); \
+ }
+#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
+#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)
+
+#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) \
+ { \
+ SRARI_H2(RTYPE, in0, in1, shift); \
+ SRARI_H2(RTYPE, in2, in3, shift); \
+ }
+#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
+#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)
+
+#define SRARI_W2(RTYPE, in0, in1, shift) \
+ { \
+ in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift); \
+ in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift); \
+ }
+
+#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) \
+ { \
+ SRARI_W2(RTYPE, in0, in1, shift); \
+ SRARI_W2(RTYPE, in2, in3, shift); \
+ }
+#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
+
+/* Description : Multiplication of pairs of vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Details : Each element from 'in0' is multiplied with elements from 'in1'
+ and the result is written to 'out0'
+ Each element from 'in2' is multiplied with elements from 'in3'
+ and the result is written to 'out1'
+ Inputs are parenthesized so expression arguments such as
+ 'a + b' are multiplied as a whole
+*/
+#define MUL2(in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (in0) * (in1); \
+ out1 = (in2) * (in3); \
+ }
+#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
+ { \
+ MUL2(in0, in1, in2, in3, out0, out1); \
+ MUL2(in4, in5, in6, in7, out2, out3); \
+ }
+
+/* Description : Addition of 2 pairs of vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Details : Each element in 'in0' is added to 'in1' and result is written
+ to 'out0'.
+ Each element in 'in2' is added to 'in3' and result is written
+ to 'out1'.
+ Inputs are parenthesized so expression arguments keep their
+ own precedence inside the expansion.
+*/
+#define ADD2(in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (in0) + (in1); \
+ out1 = (in2) + (in3); \
+ }
+#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
+ { \
+ ADD2(in0, in1, in2, in3, out0, out1); \
+ ADD2(in4, in5, in6, in7, out2, out3); \
+ }
+
+/* Description : Subtraction of 2 pairs of vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Details : Each element in 'in1' is subtracted from 'in0' and result is
+ written to 'out0'.
+ Each element in 'in3' is subtracted from 'in2' and result is
+ written to 'out1'.
+ Inputs are parenthesized so a subtrahend such as 'a - b' is
+ subtracted as a whole.
+*/
+#define SUB2(in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (in0) - (in1); \
+ out1 = (in2) - (in3); \
+ }
+/* Subtraction of 4 pairs of vectors: out0 = in0 - in1, out1 = in2 - in3,
+ out2 = in4 - in5, out3 = in6 - in7. Inputs are parenthesized so that
+ expression arguments are subtracted as a whole. */
+#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
+ { \
+ out0 = (in0) - (in1); \
+ out1 = (in2) - (in3); \
+ out2 = (in4) - (in5); \
+ out3 = (in6) - (in7); \
+ }
+
+/* Description : Sign extend halfword elements from right half of the vector
+ Arguments : Input - in (halfword vector)
+ Output - out (sign extended word vector)
+ Return Type - signed word
+ Details : Sign bit of halfword elements from input vector 'in' is
+ extracted and interleaved with same vector 'in' to generate
+ 4 word elements keeping sign intact
+*/
+#define UNPCK_R_SH_SW(in, out) \
+ { \
+ v8i16 sign_m; \
+ \
+ sign_m = __msa_clti_s_h((v8i16)in, 0); \
+ out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in); \
+ }
+
+/* Description : Zero extend unsigned byte elements to halfword elements
+ Arguments : Input - in (unsigned byte vector)
+ Outputs - out0, out1 (zero extended halfword vectors)
+ Return Type - signed halfword
+ Details : Zero extended right half of vector is returned in 'out0'
+ Zero extended left half of vector is returned in 'out1'
+*/
+#define UNPCK_UB_SH(in, out0, out1) \
+ { \
+ v16i8 zero_m = { 0 }; \
+ \
+ ILVRL_B2_SH(zero_m, in, out0, out1); \
+ }
+
+/* Description : Sign extend halfword elements from input vector and return
+ the result in pair of vectors
+ Arguments : Input - in (halfword vector)
+ Outputs - out0, out1 (sign extended word vectors)
+ Return Type - signed word
+ Details : Sign bit of halfword elements from input vector 'in' is
+ extracted and interleaved right with same vector 'in' to
+ generate 4 signed word elements in 'out0'
+ Then interleaved left with same vector 'in' to
+ generate 4 signed word elements in 'out1'
+*/
+#define UNPCK_SH_SW(in, out0, out1) \
+ { \
+ v8i16 tmp_m; \
+ \
+ tmp_m = __msa_clti_s_h((v8i16)in, 0); \
+ ILVRL_H2_SW(tmp_m, in, out0, out1); \
+ }
+
+/* Description : Butterfly of 4 input vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1, out2, out3
+ Details : Butterfly operation
+ out0 = in0 + in3, out1 = in1 + in2,
+ out2 = in1 - in2, out3 = in0 - in3
+ Inputs are parenthesized so expression arguments are added
+ and subtracted as a whole. Each input is read twice, so the
+ outputs must not name the same variables as the inputs.
+*/
+#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ out0 = (in0) + (in3); \
+ out1 = (in1) + (in2); \
+ \
+ out2 = (in1) - (in2); \
+ out3 = (in0) - (in3); \
+ }
+
+/* Description : Transpose input 8x8 byte block
+ Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
+ Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ Return Type - as per RTYPE
+*/
+#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, \
+ out1, out2, out3, out4, out5, out6, out7) \
+ { \
+ v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
+ \
+ ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, tmp0_m, tmp1_m, tmp2_m, \
+ tmp3_m); \
+ ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m); \
+ ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m); \
+ ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2); \
+ ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6); \
+ SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8); \
+ SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8); \
+ }
+#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
+
+/* Description : Transpose 16x4 block into 4x16 with byte elements in vectors
+ Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7,
+ in8, in9, in10, in11, in12, in13, in14, in15
+ Outputs - out0, out1, out2, out3
+ Return Type - unsigned byte
+*/
+#define TRANSPOSE16x4_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, \
+ in10, in11, in12, in13, in14, in15, out0, out1, \
+ out2, out3) \
+ { \
+ v2i64 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ \
+ ILVEV_W2_SD(in0, in4, in8, in12, tmp0_m, tmp1_m); \
+ out1 = (v16u8)__msa_ilvev_d(tmp1_m, tmp0_m); \
+ \
+ ILVEV_W2_SD(in1, in5, in9, in13, tmp0_m, tmp1_m); \
+ out3 = (v16u8)__msa_ilvev_d(tmp1_m, tmp0_m); \
+ \
+ ILVEV_W2_SD(in2, in6, in10, in14, tmp0_m, tmp1_m); \
+ \
+ tmp2_m = __msa_ilvev_d(tmp1_m, tmp0_m); \
+ ILVEV_W2_SD(in3, in7, in11, in15, tmp0_m, tmp1_m); \
+ \
+ tmp3_m = __msa_ilvev_d(tmp1_m, tmp0_m); \
+ ILVEV_B2_SD(out1, out3, tmp2_m, tmp3_m, tmp0_m, tmp1_m); \
+ out0 = (v16u8)__msa_ilvev_h((v8i16)tmp1_m, (v8i16)tmp0_m); \
+ out2 = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m); \
+ \
+ tmp0_m = (v2i64)__msa_ilvod_b((v16i8)out3, (v16i8)out1); \
+ tmp1_m = (v2i64)__msa_ilvod_b((v16i8)tmp3_m, (v16i8)tmp2_m); \
+ out1 = (v16u8)__msa_ilvev_h((v8i16)tmp1_m, (v8i16)tmp0_m); \
+ out3 = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m); \
+ }
+
+/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors
+ Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7,
+ in8, in9, in10, in11, in12, in13, in14, in15
+ Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ Return Type - unsigned byte
+ Details : out0..out7 are written as scratch before all inputs have
+ been read, so they must not alias any of the inputs
+*/
+#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, \
+ in10, in11, in12, in13, in14, in15, out0, out1, \
+ out2, out3, out4, out5, out6, out7) \
+ { \
+ v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
+ \
+ ILVEV_D2_UB(in0, in8, in1, in9, out7, out6); \
+ ILVEV_D2_UB(in2, in10, in3, in11, out5, out4); \
+ ILVEV_D2_UB(in4, in12, in5, in13, out3, out2); \
+ ILVEV_D2_UB(in6, in14, in7, in15, out1, out0); \
+ \
+ tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7); \
+ tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7); \
+ tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5); \
+ tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5); \
+ out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3); \
+ tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3); \
+ out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1); \
+ tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1); \
+ \
+ ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m); \
+ out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
+ out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
+ \
+ tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m); \
+ tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5); \
+ out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
+ out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
+ \
+ ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m); \
+ out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
+ out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
+ \
+ tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \
+ tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \
+ out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
+ out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
+ }
+
+/* Description : Transpose 4x4 block with half word elements in vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1, out2, out3
+ Return Type - signed halfword
+*/
+#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ v8i16 s0_m, s1_m; \
+ \
+ ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m); \
+ ILVRL_W2_SH(s1_m, s0_m, out0, out2); \
+ out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \
+ out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2); \
+ }
+
+/* Description : Transpose 8x4 block with half word elements in vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1, out2, out3
+ Return Type - signed halfword
+*/
+#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ \
+ ILVR_H2_SH(in1, in0, in3, in2, tmp0_m, tmp1_m); \
+ ILVL_H2_SH(in1, in0, in3, in2, tmp2_m, tmp3_m); \
+ ILVR_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2); \
+ ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3); \
+ }
+
+/* Description : Transpose 4x4 block with word elements in vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1, out2, out3
+ Return Type - signed word
+*/
+#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ v4i32 s0_m, s1_m, s2_m, s3_m; \
+ \
+ ILVRL_W2_SW(in1, in0, s0_m, s1_m); \
+ ILVRL_W2_SW(in3, in2, s2_m, s3_m); \
+ \
+ out0 = (v4i32)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m); \
+ out1 = (v4i32)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m); \
+ out2 = (v4i32)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m); \
+ out3 = (v4i32)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m); \
+ }
+
+/* Description : Dot product and addition of 3 signed byte input vectors
+ Arguments : Inputs - in0, in1, in2, coeff0, coeff1, coeff2
+ Output - out0_m
+ Return Type - signed halfword
+ Details : Dot product of 'in0' with 'coeff0'
+ Dot product of 'in1' with 'coeff1'
+ Dot product of 'in2' with 'coeff2'
+ Addition of all the 3 vector results (the last with saturation)
+ out0_m = (in0 * coeff0) + (in1 * coeff1) + (in2 * coeff2)
+*/
+#define DPADD_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2) \
+ ({ \
+ v8i16 tmp1_m; \
+ v8i16 out0_m; \
+ \
+ out0_m = __msa_dotp_s_h((v16i8)in0, (v16i8)coeff0); \
+ out0_m = __msa_dpadd_s_h(out0_m, (v16i8)in1, (v16i8)coeff1); \
+ tmp1_m = __msa_dotp_s_h((v16i8)in2, (v16i8)coeff2); \
+ out0_m = __msa_adds_s_h(out0_m, tmp1_m); \
+ \
+ out0_m; \
+ })
+
+/* Description : Pack even elements of input vectors & xor with 128
+ Arguments : Inputs - in0, in1
+ Output - out_m
+ Return Type - unsigned byte
+ Details : Signed byte even elements from 'in0' and 'in1' are packed
+ together in one vector and the resulting vector is xor'ed with
+ 128 to shift the range from signed to unsigned byte
+*/
+#define PCKEV_XORI128_UB(in0, in1) \
+ ({ \
+ v16u8 out_m; \
+ out_m = (v16u8)__msa_pckev_b((v16i8)in1, (v16i8)in0); \
+ out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128); \
+ out_m; \
+ })
+
+/* Description : Pack even byte elements and store byte vector in destination
+ memory
+ Arguments : Inputs - in0, in1, pdst
+*/
+#define PCKEV_ST_SB(in0, in1, pdst) \
+ { \
+ v16i8 tmp_m; \
+ tmp_m = __msa_pckev_b((v16i8)in1, (v16i8)in0); \
+ ST_SB(tmp_m, (pdst)); \
+ }
+
+/* Description : Horizontal 2 tap filter kernel code
+ Arguments : Inputs - in0, in1, mask, coeff, shift
+*/
+#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) \
+ ({ \
+ v16i8 tmp0_m; \
+ v8u16 tmp1_m; \
+ \
+ tmp0_m = __msa_vshf_b((v16i8)mask, (v16i8)in1, (v16i8)in0); \
+ tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)coeff); \
+ tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift); \
+ \
+ tmp1_m; \
+ })
+#endif // VPX_VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_
diff --git a/media/libvpx/libvpx/vp8/common/modecont.c b/media/libvpx/libvpx/vp8/common/modecont.c
new file mode 100644
index 0000000000..bab410374f
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/modecont.c
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "entropy.h"
+
+const int vp8_mode_contexts[6][4] = {
+ { /* 0 */
+ 7, 1, 1, 143 },
+ { /* 1 */
+ 14, 18, 14, 107 },
+ { /* 2 */
+ 135, 64, 57, 68 },
+ { /* 3 */
+ 60, 56, 128, 65 },
+ { /* 4 */
+ 159, 134, 128, 34 },
+ { /* 5 */
+ 234, 188, 128, 28 },
+};
diff --git a/media/libvpx/libvpx/vp8/common/modecont.h b/media/libvpx/libvpx/vp8/common/modecont.h
new file mode 100644
index 0000000000..031f74f2ff
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/modecont.h
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP8_COMMON_MODECONT_H_
+#define VPX_VP8_COMMON_MODECONT_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern const int vp8_mode_contexts[6][4];
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP8_COMMON_MODECONT_H_
diff --git a/media/libvpx/libvpx/vp8/common/mv.h b/media/libvpx/libvpx/vp8/common/mv.h
new file mode 100644
index 0000000000..4cde12f201
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/mv.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP8_COMMON_MV_H_
+#define VPX_VP8_COMMON_MV_H_
+#include "vpx/vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+ short row;
+ short col;
+} MV;
+
+typedef union int_mv {
+ uint32_t as_int;
+ MV as_mv;
+} int_mv; /* facilitates faster equality tests and copies */
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP8_COMMON_MV_H_
diff --git a/media/libvpx/libvpx/vp8/common/onyx.h b/media/libvpx/libvpx/vp8/common/onyx.h
new file mode 100644
index 0000000000..8c35e433e7
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/onyx.h
@@ -0,0 +1,277 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP8_COMMON_ONYX_H_
+#define VPX_VP8_COMMON_ONYX_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "vpx_config.h"
+#include "vpx/internal/vpx_codec_internal.h"
+#include "vpx/vp8cx.h"
+#include "vpx/vpx_encoder.h"
+#include "vpx_scale/yv12config.h"
+#include "ppflags.h"
+
+struct VP8_COMP;
+
+/* Create/destroy static data structures. */
+
+typedef enum {
+ USAGE_LOCAL_FILE_PLAYBACK = 0x0,
+ USAGE_STREAM_FROM_SERVER = 0x1,
+ USAGE_CONSTRAINED_QUALITY = 0x2,
+ USAGE_CONSTANT_QUALITY = 0x3
+} END_USAGE;
+
+typedef enum {
+ MODE_REALTIME = 0x0,
+ MODE_GOODQUALITY = 0x1,
+ MODE_BESTQUALITY = 0x2,
+ MODE_FIRSTPASS = 0x3,
+ MODE_SECONDPASS = 0x4,
+ MODE_SECONDPASS_BEST = 0x5
+} MODE;
+
+typedef enum {
+ FRAMEFLAGS_KEY = 1,
+ FRAMEFLAGS_GOLDEN = 2,
+ FRAMEFLAGS_ALTREF = 4
+} FRAMETYPE_FLAGS;
+
+#include <assert.h>
+/* Translates a VP8E_* scaling mode into the two terms of its ratio, stored in
+ * *hr and *hs (for example 4 and 5 for VP8E_FOURFIVE). An unrecognized mode
+ * stores 1 and 1 and then trips assert(0).
+ */
+static INLINE void Scale2Ratio(int mode, int *hr, int *hs) {
+  int first = 1;
+  int second = 1;
+  int recognized = 1;
+
+  switch (mode) {
+    case VP8E_NORMAL: break;
+    case VP8E_FOURFIVE:
+      first = 4;
+      second = 5;
+      break;
+    case VP8E_THREEFIVE:
+      first = 3;
+      second = 5;
+      break;
+    case VP8E_ONETWO: second = 2; break;
+    default: recognized = 0; break;
+  }
+
+  /* The outputs are always written before the assertion can fire. */
+  *hr = first;
+  *hs = second;
+  if (!recognized) assert(0);
+}
+
+typedef struct {
+ /* 4 versions of bitstream defined:
+ * 0 best quality/slowest decode, 3 lowest quality/fastest decode
+ */
+ int Version;
+ int Width;
+ int Height;
+ struct vpx_rational timebase;
+ unsigned int target_bandwidth; /* kilobits per second */
+
+ /* Parameter used for applying denoiser.
+ * For temporal denoiser: noise_sensitivity = 0 means off,
+ * noise_sensitivity = 1 means temporal denoiser on for Y channel only,
+ * noise_sensitivity = 2 means temporal denoiser on for all channels.
+ * noise_sensitivity = 3 means aggressive denoising mode.
+ * noise_sensitivity >= 4 means adaptive denoising mode.
+ * Temporal denoiser is enabled via the configuration option:
+ * CONFIG_TEMPORAL_DENOISING.
+ * For spatial denoiser: noise_sensitivity controls the amount of
+ * pre-processing blur: noise_sensitivity = 0 means off.
+ * Spatial denoiser invoked under !CONFIG_TEMPORAL_DENOISING.
+ */
+ int noise_sensitivity;
+
+ /* parameter used for sharpening output: recommendation 0: */
+ int Sharpness;
+ int cpu_used;
+ unsigned int rc_max_intra_bitrate_pct;
+ /* percent of rate boost for golden frame in CBR mode. */
+ unsigned int gf_cbr_boost_pct;
+ unsigned int screen_content_mode;
+
+ /* mode ->
+ *(0)=Realtime/Live Encoding. This mode is optimized for realtime
+ * encoding (for example, capturing a television signal or feed
+ * from a live camera). ( speed setting controls how fast )
+ *(1)=Good Quality Fast Encoding. The encoder balances quality with
+ * the amount of time it takes to encode the output. ( speed
+ * setting controls how fast )
+ *(2)=One Pass - Best Quality. The encoder places priority on the
+ * quality of the output over encoding speed. The output is
+ * compressed at the highest possible quality. This option takes
+ * the longest amount of time to encode. ( speed setting ignored
+ * )
+ *(3)=Two Pass - First Pass. The encoder generates a file of
+ * statistics for use in the second encoding pass. ( speed
+ * setting controls how fast )
+ *(4)=Two Pass - Second Pass. The encoder uses the statistics that
+ * were generated in the first encoding pass to create the
+ * compressed output. ( speed setting controls how fast )
+ *(5)=Two Pass - Second Pass Best. The encoder uses the statistics
+ * that were generated in the first encoding pass to create the
+ * compressed output using the highest possible quality, and
+ * taking a longer amount of time to encode. ( speed setting
+ * ignored )
+ */
+ int Mode;
+
+ /* Key Framing Operations */
+ int auto_key; /* automatically detect cut scenes */
+ int key_freq; /* maximum distance to key frame. */
+
+ /* lagged compression (if allow_lag == 0 lag_in_frames is ignored) */
+ int allow_lag;
+ int lag_in_frames; /* how many frames lag before we start encoding */
+
+ /*
+ * DATARATE CONTROL OPTIONS
+ */
+
+ int end_usage; /* vbr or cbr */
+
+ /* buffer targeting aggressiveness */
+ int under_shoot_pct;
+ int over_shoot_pct;
+
+ /* buffering parameters */
+ int64_t starting_buffer_level;
+ int64_t optimal_buffer_level;
+ int64_t maximum_buffer_size;
+
+ int64_t starting_buffer_level_in_ms;
+ int64_t optimal_buffer_level_in_ms;
+ int64_t maximum_buffer_size_in_ms;
+
+ /* controlling quality */
+ int fixed_q;
+ int worst_allowed_q;
+ int best_allowed_q;
+ int cq_level;
+
+ /* allow internal resizing */
+ int allow_spatial_resampling;
+ int resample_down_water_mark;
+ int resample_up_water_mark;
+
+ /* allow internal frame rate alterations */
+ int allow_df;
+ int drop_frames_water_mark;
+
+ /* two pass datarate control */
+ int two_pass_vbrbias;
+ int two_pass_vbrmin_section;
+ int two_pass_vbrmax_section;
+
+ /*
+ * END DATARATE CONTROL OPTIONS
+ */
+
+ /* these parameters aren't to be used in final build don't use!!! */
+ int play_alternate;
+ int alt_freq;
+ int alt_q;
+ int key_q;
+ int gold_q;
+
+ int multi_threaded; /* how many threads to run the encoder on */
+ int token_partitions; /* how many token partitions to create */
+
+ /* early breakout threshold: for video conf recommend 800 */
+ int encode_breakout;
+
+ /* Bitfield defining the error resiliency features to enable.
+ * Can provide decodable frames after losses in previous
+ * frames and decodable partitions after losses in the same frame.
+ */
+ unsigned int error_resilient_mode;
+
+ int arnr_max_frames;
+ int arnr_strength;
+ int arnr_type;
+
+ vpx_fixed_buf_t two_pass_stats_in;
+ struct vpx_codec_pkt_list *output_pkt_list;
+
+ vp8e_tuning tuning;
+
+ /* Temporal scaling parameters */
+ unsigned int number_of_layers;
+ unsigned int target_bitrate[VPX_TS_MAX_PERIODICITY];
+ unsigned int rate_decimator[VPX_TS_MAX_PERIODICITY];
+ unsigned int periodicity;
+ unsigned int layer_id[VPX_TS_MAX_PERIODICITY];
+
+#if CONFIG_MULTI_RES_ENCODING
+ /* Number of total resolutions encoded */
+ unsigned int mr_total_resolutions;
+
+ /* Current encoder ID */
+ unsigned int mr_encoder_id;
+
+ /* Down-sampling factor */
+ vpx_rational_t mr_down_sampling_factor;
+
+ /* Memory location to store low-resolution encoder's mode info */
+ void *mr_low_res_mode_info;
+#endif
+} VP8_CONFIG;
+
+/* Declared with an explicit (void): before C23 an empty parameter list is not
+ * a prototype, so calls with stray arguments would go unchecked. */
+void vp8_initialize(void);
+
+struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf);
+void vp8_remove_compressor(struct VP8_COMP **comp);
+
+void vp8_init_config(struct VP8_COMP *onyx, VP8_CONFIG *oxcf);
+void vp8_change_config(struct VP8_COMP *cpi, VP8_CONFIG *oxcf);
+
+int vp8_receive_raw_frame(struct VP8_COMP *cpi, unsigned int frame_flags,
+ YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
+ int64_t end_time);
+int vp8_get_compressed_data(struct VP8_COMP *cpi, unsigned int *frame_flags,
+ size_t *size, unsigned char *dest,
+ unsigned char *dest_end, int64_t *time_stamp,
+ int64_t *time_end, int flush);
+int vp8_get_preview_raw_frame(struct VP8_COMP *cpi, YV12_BUFFER_CONFIG *dest,
+ vp8_ppflags_t *flags);
+
+int vp8_use_as_reference(struct VP8_COMP *cpi, int ref_frame_flags);
+int vp8_update_reference(struct VP8_COMP *cpi, int ref_frame_flags);
+int vp8_get_reference(struct VP8_COMP *cpi,
+ enum vpx_ref_frame_type ref_frame_flag,
+ YV12_BUFFER_CONFIG *sd);
+int vp8_set_reference(struct VP8_COMP *cpi,
+ enum vpx_ref_frame_type ref_frame_flag,
+ YV12_BUFFER_CONFIG *sd);
+int vp8_update_entropy(struct VP8_COMP *cpi, int update);
+int vp8_set_roimap(struct VP8_COMP *cpi, unsigned char *map, unsigned int rows,
+ unsigned int cols, int delta_q[4], int delta_lf[4],
+ unsigned int threshold[4]);
+int vp8_set_active_map(struct VP8_COMP *cpi, unsigned char *map,
+ unsigned int rows, unsigned int cols);
+int vp8_set_internal_size(struct VP8_COMP *cpi, VPX_SCALING_MODE horiz_mode,
+ VPX_SCALING_MODE vert_mode);
+int vp8_get_quantizer(struct VP8_COMP *cpi);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // VPX_VP8_COMMON_ONYX_H_
diff --git a/media/libvpx/libvpx/vp8/common/onyxc_int.h b/media/libvpx/libvpx/vp8/common/onyxc_int.h
new file mode 100644
index 0000000000..ef8d007620
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/onyxc_int.h
@@ -0,0 +1,177 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP8_COMMON_ONYXC_INT_H_
+#define VPX_VP8_COMMON_ONYXC_INT_H_
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#include "vpx/internal/vpx_codec_internal.h"
+#include "loopfilter.h"
+#include "entropymv.h"
+#include "entropy.h"
+#if CONFIG_POSTPROC
+#include "postproc.h"
+#endif
+
+/*#ifdef PACKET_TESTING*/
+#include "header.h"
+/*#endif*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MINQ 0
+#define MAXQ 127
+#define QINDEX_RANGE (MAXQ + 1)
+
+#define NUM_YV12_BUFFERS 4
+
+#define MAX_PARTITIONS 9
+
+typedef struct frame_contexts {
+ vp8_prob bmode_prob[VP8_BINTRAMODES - 1];
+ vp8_prob ymode_prob[VP8_YMODES - 1]; /* interframe intra mode probs */
+ vp8_prob uv_mode_prob[VP8_UV_MODES - 1];
+ vp8_prob sub_mv_ref_prob[VP8_SUBMVREFS - 1];
+ vp8_prob coef_probs[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS]
+ [ENTROPY_NODES];
+ MV_CONTEXT mvc[2];
+} FRAME_CONTEXT;
+
+typedef enum {
+ ONE_PARTITION = 0,
+ TWO_PARTITION = 1,
+ FOUR_PARTITION = 2,
+ EIGHT_PARTITION = 3
+} TOKEN_PARTITION;
+
+typedef enum {
+ RECON_CLAMP_REQUIRED = 0,
+ RECON_CLAMP_NOTREQUIRED = 1
+} CLAMP_TYPE;
+
+typedef struct VP8Common {
+ struct vpx_internal_error_info error;
+
+ DECLARE_ALIGNED(16, short, Y1dequant[QINDEX_RANGE][2]);
+ DECLARE_ALIGNED(16, short, Y2dequant[QINDEX_RANGE][2]);
+ DECLARE_ALIGNED(16, short, UVdequant[QINDEX_RANGE][2]);
+
+ int Width;
+ int Height;
+ int horiz_scale;
+ int vert_scale;
+
+ CLAMP_TYPE clamp_type;
+
+ YV12_BUFFER_CONFIG *frame_to_show;
+
+ YV12_BUFFER_CONFIG yv12_fb[NUM_YV12_BUFFERS];
+ int fb_idx_ref_cnt[NUM_YV12_BUFFERS];
+ int new_fb_idx, lst_fb_idx, gld_fb_idx, alt_fb_idx;
+
+ YV12_BUFFER_CONFIG temp_scale_frame;
+
+#if CONFIG_POSTPROC
+ YV12_BUFFER_CONFIG post_proc_buffer;
+ YV12_BUFFER_CONFIG post_proc_buffer_int;
+ int post_proc_buffer_int_used;
+ unsigned char *pp_limits_buffer; /* post-processing filter coefficients */
+#endif
+
+ FRAME_TYPE
+ last_frame_type; /* Save last frame's frame type for motion search. */
+ FRAME_TYPE frame_type;
+
+ int show_frame;
+
+ int frame_flags;
+ int MBs;
+ int mb_rows;
+ int mb_cols;
+ int mode_info_stride;
+
+ /* profile settings */
+ int mb_no_coeff_skip;
+ int no_lpf;
+ int use_bilinear_mc_filter;
+ int full_pixel;
+
+ int base_qindex;
+
+ int y1dc_delta_q;
+ int y2dc_delta_q;
+ int y2ac_delta_q;
+ int uvdc_delta_q;
+ int uvac_delta_q;
+
+ /* We allocate a MODE_INFO struct for each macroblock, together with
+ an extra row on top and column on the left to simplify prediction. */
+
+ MODE_INFO *mip; /* Base of allocated array */
+ MODE_INFO *mi; /* Corresponds to upper left visible macroblock */
+#if CONFIG_ERROR_CONCEALMENT
+ MODE_INFO *prev_mip; /* MODE_INFO array 'mip' from last decoded frame */
+ MODE_INFO *prev_mi; /* 'mi' from last frame (points into prev_mip) */
+#endif
+ /* MODE_INFO for the last decoded frame to show */
+ MODE_INFO *show_frame_mi;
+ LOOPFILTERTYPE filter_type;
+
+ loop_filter_info_n lf_info;
+
+ int filter_level;
+ int last_sharpness_level;
+ int sharpness_level;
+
+ int refresh_last_frame; /* Two state 0 = NO, 1 = YES */
+ int refresh_golden_frame; /* Two state 0 = NO, 1 = YES */
+ int refresh_alt_ref_frame; /* Two state 0 = NO, 1 = YES */
+
+ int copy_buffer_to_gf; /* 0 none, 1 Last to GF, 2 ARF to GF */
+ int copy_buffer_to_arf; /* 0 none, 1 Last to ARF, 2 GF to ARF */
+
+ int refresh_entropy_probs; /* Two state 0 = NO, 1 = YES */
+
+ int ref_frame_sign_bias[MAX_REF_FRAMES]; /* Two state 0, 1 */
+
+ /* Y,U,V,Y2 */
+ ENTROPY_CONTEXT_PLANES *above_context; /* row of context for each plane */
+ ENTROPY_CONTEXT_PLANES left_context; /* (up to) 4 contexts "" */
+
+ FRAME_CONTEXT lfc; /* last frame entropy */
+ FRAME_CONTEXT fc; /* this frame entropy */
+
+ unsigned int current_video_frame;
+
+ int version;
+
+ TOKEN_PARTITION multi_token_partition;
+
+#ifdef PACKET_TESTING
+ VP8_HEADER oh;
+#endif
+
+#if CONFIG_MULTITHREAD
+ int processor_core_count;
+#endif
+#if CONFIG_POSTPROC
+ struct postproc_state postproc_state;
+#endif
+ int cpu_caps;
+} VP8_COMMON;
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP8_COMMON_ONYXC_INT_H_
diff --git a/media/libvpx/libvpx/vp8/common/onyxd.h b/media/libvpx/libvpx/vp8/common/onyxd.h
new file mode 100644
index 0000000000..e4e81aaac5
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/onyxd.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP8_COMMON_ONYXD_H_
+#define VPX_VP8_COMMON_ONYXD_H_
+
+/* Create/destroy static data structures. */
+#ifdef __cplusplus
+extern "C" {
+#endif
+#include "vpx_scale/yv12config.h"
+#include "ppflags.h"
+#include "vpx_ports/mem.h"
+#include "vpx/vpx_codec.h"
+#include "vpx/vp8.h"
+
+struct VP8D_COMP;
+struct VP8Common;
+
+typedef struct {
+ int Width;
+ int Height;
+ int Version;
+ int postprocess;
+ int max_threads;
+ int error_concealment;
+} VP8D_CONFIG;
+
+typedef enum { VP8D_OK = 0 } VP8D_SETTING;
+
+void vp8dx_initialize(void);
+
+void vp8dx_set_setting(struct VP8D_COMP *comp, VP8D_SETTING oxst, int x);
+
+int vp8dx_get_setting(struct VP8D_COMP *comp, VP8D_SETTING oxst);
+
+int vp8dx_receive_compressed_data(struct VP8D_COMP *pbi, int64_t time_stamp);
+int vp8dx_get_raw_frame(struct VP8D_COMP *pbi, YV12_BUFFER_CONFIG *sd,
+ int64_t *time_stamp, int64_t *time_end_stamp,
+ vp8_ppflags_t *flags);
+int vp8dx_references_buffer(struct VP8Common *oci, int ref_frame);
+
+vpx_codec_err_t vp8dx_get_reference(struct VP8D_COMP *pbi,
+ enum vpx_ref_frame_type ref_frame_flag,
+ YV12_BUFFER_CONFIG *sd);
+vpx_codec_err_t vp8dx_set_reference(struct VP8D_COMP *pbi,
+ enum vpx_ref_frame_type ref_frame_flag,
+ YV12_BUFFER_CONFIG *sd);
+int vp8dx_get_quantizer(const struct VP8D_COMP *pbi);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // VPX_VP8_COMMON_ONYXD_H_
diff --git a/media/libvpx/libvpx/vp8/common/postproc.c b/media/libvpx/libvpx/vp8/common/postproc.c
new file mode 100644
index 0000000000..c03b16b2f5
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/postproc.c
@@ -0,0 +1,264 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vpx_dsp_rtcd.h"
+#include "vp8_rtcd.h"
+#include "vpx_dsp/postproc.h"
+#include "vpx_ports/system_state.h"
+#include "vpx_scale_rtcd.h"
+#include "vpx_scale/yv12config.h"
+#include "postproc.h"
+#include "common.h"
+#include "vpx_scale/vpx_scale.h"
+#include "systemdependent.h"
+
+#include <limits.h>
+#include <math.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+/* clang-format off */
+/* Expands to three comma-separated unsigned char values computed from 't'.
+ * The three colour inputs are read as (t) >> 16, ((t) >> 8) & 0xff and
+ * (t) & 0xff. The top field is not masked, so bits above bit 23 of 't' are
+ * expected to be zero. 't' is parenthesized at every use so that an
+ * expression argument such as (a | b) is evaluated as a whole; it is still
+ * evaluated several times, so it must be free of side effects.
+ */
+#define RGB_TO_YUV(t)                                    \
+  (unsigned char)((0.257 * (float)((t) >> 16)) +         \
+                  (0.504 * (float)((t) >> 8 & 0xff)) +   \
+                  (0.098 * (float)((t) & 0xff)) + 16),   \
+  (unsigned char)(-(0.148 * (float)((t) >> 16)) -        \
+                  (0.291 * (float)((t) >> 8 & 0xff)) +   \
+                  (0.439 * (float)((t) & 0xff)) + 128),  \
+  (unsigned char)((0.439 * (float)((t) >> 16)) -         \
+                  (0.368 * (float)((t) >> 8 & 0xff)) -   \
+                  (0.071 * (float)((t) & 0xff)) + 128)
+/* clang-format on */
+
+extern void vp8_blit_text(const char *msg, unsigned char *address,
+ const int pitch);
+extern void vp8_blit_line(int x0, int x1, int y0, int y1, unsigned char *image,
+ const int pitch);
+/***********************************************************************************************************
+ */
+#if CONFIG_POSTPROC
+/* Maps q to the strength value handed to the macroblock post-filters: q is
+ * raised to at least 20, rescaled around 50 by a factor of 10/8 (integer
+ * arithmetic), then squared and divided by 3.
+ */
+static int q2mbl(int x) {
+  const int floored = (x < 20) ? 20 : x;
+  const int scaled = 50 + (floored - 50) * 10 / 8;
+
+  return scaled * scaled / 3;
+}
+
+/* Runs the two macroblock post-filter passes over the Y plane of 'post':
+ * first vpx_mbpost_proc_across_ip(), then vpx_mbpost_proc_down(), both with
+ * the strength q2mbl(q).
+ */
+static void vp8_de_mblock(YV12_BUFFER_CONFIG *post, int q) {
+  const int strength = q2mbl(q);
+
+  vpx_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height,
+                            post->y_width, strength);
+  vpx_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height,
+                       post->y_width, strength);
+}
+
+/* Deblocks 'source' into 'post', one macroblock row at a time.
+ *
+ * A filter level is derived from q with a cubic polynomial and rounded to an
+ * integer. If that level is not positive, the frame is simply copied.
+ * Otherwise every macroblock gets a limit equal to the level (halved when the
+ * macroblock has mb_skip_coeff set), and each macroblock row of the Y, U and
+ * V planes is filtered with the limits of that row.
+ */
+void vp8_deblock(VP8_COMMON *cm, YV12_BUFFER_CONFIG *source,
+                 YV12_BUFFER_CONFIG *post, int q) {
+  const double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
+  const int ppl = (int)(level + .5);
+  const MODE_INFO *mi = cm->mi;
+  /* Scratch limits: 16 luma entries per macroblock column come first, the
+   * chroma entries (8 per column) start right after them. */
+  unsigned char *const y_limits = cm->pp_limits_buffer;
+  unsigned char *const uv_limits = cm->pp_limits_buffer + 16 * cm->mb_cols;
+  int row;
+
+  if (ppl <= 0) {
+    vp8_yv12_copy_frame(source, post);
+    return;
+  }
+
+  for (row = 0; row < cm->mb_rows; ++row) {
+    int col;
+
+    for (col = 0; col < cm->mb_cols; ++col, ++mi) {
+      const unsigned char full = (unsigned char)ppl;
+      const unsigned char mb_ppl =
+          mi->mbmi.mb_skip_coeff ? (unsigned char)(full >> 1) : full;
+
+      memset(y_limits + 16 * col, mb_ppl, 16);
+      memset(uv_limits + 8 * col, mb_ppl, 8);
+    }
+    /* One additional MODE_INFO entry is stepped over after every row. */
+    ++mi;
+
+    vpx_post_proc_down_and_across_mb_row(
+        source->y_buffer + 16 * row * source->y_stride,
+        post->y_buffer + 16 * row * post->y_stride, source->y_stride,
+        post->y_stride, source->y_width, y_limits, 16);
+
+    vpx_post_proc_down_and_across_mb_row(
+        source->u_buffer + 8 * row * source->uv_stride,
+        post->u_buffer + 8 * row * post->uv_stride, source->uv_stride,
+        post->uv_stride, source->uv_width, uv_limits, 8);
+    vpx_post_proc_down_and_across_mb_row(
+        source->v_buffer + 8 * row * source->uv_stride,
+        post->v_buffer + 8 * row * post->uv_stride, source->uv_stride,
+        post->uv_stride, source->uv_width, uv_limits, 8);
+  }
+}
+
+/* Filters 'source' in place using one limit for the whole frame.
+ *
+ * The limit is the rounded value of the same cubic in q that vp8_deblock()
+ * evaluates. Every macroblock row of the Y plane is filtered; the U and V
+ * rows are filtered only when uvfilter == 1.
+ */
+void vp8_de_noise(VP8_COMMON *cm, YV12_BUFFER_CONFIG *source, int q,
+                  int uvfilter) {
+  const double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
+  const int ppl = (int)(level + .5);
+  const int rows = cm->mb_rows;
+  const int cols = cm->mb_cols;
+  unsigned char *const limits = cm->pp_limits_buffer;
+  int row;
+
+  memset(limits, (unsigned char)ppl, 16 * cols);
+
+  /* TODO: The original code doesn't filter the 2 outer rows and columns. */
+  for (row = 0; row < rows; ++row) {
+    const int y_off = 16 * row * source->y_stride;
+
+    vpx_post_proc_down_and_across_mb_row(
+        source->y_buffer + y_off, source->y_buffer + y_off, source->y_stride,
+        source->y_stride, source->y_width, limits, 16);
+
+    if (uvfilter == 1) {
+      const int uv_off = 8 * row * source->uv_stride;
+
+      vpx_post_proc_down_and_across_mb_row(
+          source->u_buffer + uv_off, source->u_buffer + uv_off,
+          source->uv_stride, source->uv_stride, source->uv_width, limits, 8);
+      vpx_post_proc_down_and_across_mb_row(
+          source->v_buffer + uv_off, source->v_buffer + uv_off,
+          source->uv_stride, source->uv_stride, source->uv_width, limits, 8);
+    }
+  }
+}
+#endif // CONFIG_POSTPROC
+
+#if CONFIG_POSTPROC
+/* Applies the post-processing stages selected by ppflags->post_proc_flag to
+ * oci->frame_to_show and describes the resulting picture in *dest.
+ *
+ * With no flags set, *dest is a struct copy of *oci->frame_to_show; otherwise
+ * it is a struct copy of oci->post_proc_buffer after filtering. In both cases
+ * the reported Y size is reset to oci->Width x oci->Height.
+ *
+ * Returns 0 on success, -1 when there is no frame to show, and 1 when a
+ * working buffer could not be allocated.
+ */
+int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest,
+                        vp8_ppflags_t *ppflags) {
+  int q = oci->filter_level * 10 / 6;
+  int flags = ppflags->post_proc_flag;
+  int deblock_level = ppflags->deblocking_level;
+  int noise_level = ppflags->noise_level;
+
+  if (!oci->frame_to_show) return -1;
+
+  if (q > 63) q = 63;
+
+  if (!flags) {
+    *dest = *oci->frame_to_show;
+
+    /* handle problem with extending borders */
+    dest->y_width = oci->Width;
+    dest->y_height = oci->Height;
+    dest->uv_height = dest->y_height / 2;
+    oci->postproc_state.last_base_qindex = oci->base_qindex;
+    oci->postproc_state.last_frame_valid = 1;
+    return 0;
+  }
+
+  /* The noise buffer is allocated on first use and kept for later calls. */
+  if (flags & VP8D_ADDNOISE) {
+    if (!oci->postproc_state.generated_noise) {
+      oci->postproc_state.generated_noise = vpx_calloc(
+          oci->Width + 256, sizeof(*oci->postproc_state.generated_noise));
+      if (!oci->postproc_state.generated_noise) return 1;
+    }
+  }
+
+  /* Allocate post_proc_buffer_int if needed */
+  if ((flags & VP8D_MFQE) && !oci->post_proc_buffer_int_used) {
+    if ((flags & VP8D_DEBLOCK) || (flags & VP8D_DEMACROBLOCK)) {
+      int width = (oci->Width + 15) & ~15;
+      int height = (oci->Height + 15) & ~15;
+
+      if (vp8_yv12_alloc_frame_buffer(&oci->post_proc_buffer_int, width, height,
+                                      VP8BORDERINPIXELS)) {
+        vpx_internal_error(&oci->error, VPX_CODEC_MEM_ERROR,
+                           "Failed to allocate MFQE framebuffer");
+        /* NOTE(review): vpx_internal_error() presumably does not return when
+         * an error handler is installed - confirm. If it does return, stop
+         * here instead of marking the buffer usable and writing through it.
+         */
+        return 1;
+      }
+
+      oci->post_proc_buffer_int_used = 1;
+
+      /* Fill the whole allocation with 128 so that post proc doesn't pull
+       * random data in from the edge. The length is taken from the buffer
+       * that is actually being filled.
+       */
+      memset(oci->post_proc_buffer_int.buffer_alloc, 128,
+             oci->post_proc_buffer_int.frame_size);
+    }
+  }
+
+  vpx_clear_system_state();
+
+  if ((flags & VP8D_MFQE) && oci->postproc_state.last_frame_valid &&
+      oci->current_video_frame > 10 &&
+      oci->postproc_state.last_base_qindex < 60 &&
+      oci->base_qindex - oci->postproc_state.last_base_qindex >= 20) {
+    vp8_multiframe_quality_enhance(oci);
+    if (((flags & VP8D_DEBLOCK) || (flags & VP8D_DEMACROBLOCK)) &&
+        oci->post_proc_buffer_int_used) {
+      vp8_yv12_copy_frame(&oci->post_proc_buffer, &oci->post_proc_buffer_int);
+      if (flags & VP8D_DEMACROBLOCK) {
+        vp8_deblock(oci, &oci->post_proc_buffer_int, &oci->post_proc_buffer,
+                    q + (deblock_level - 5) * 10);
+        vp8_de_mblock(&oci->post_proc_buffer, q + (deblock_level - 5) * 10);
+      } else if (flags & VP8D_DEBLOCK) {
+        vp8_deblock(oci, &oci->post_proc_buffer_int, &oci->post_proc_buffer, q);
+      }
+    }
+    /* Move the remembered base q a quarter of the way towards the base q of
+     * the current frame. */
+    oci->postproc_state.last_base_qindex =
+        (3 * oci->postproc_state.last_base_qindex + oci->base_qindex) >> 2;
+  } else if (flags & VP8D_DEMACROBLOCK) {
+    vp8_deblock(oci, oci->frame_to_show, &oci->post_proc_buffer,
+                q + (deblock_level - 5) * 10);
+    vp8_de_mblock(&oci->post_proc_buffer, q + (deblock_level - 5) * 10);
+
+    oci->postproc_state.last_base_qindex = oci->base_qindex;
+  } else if (flags & VP8D_DEBLOCK) {
+    vp8_deblock(oci, oci->frame_to_show, &oci->post_proc_buffer, q);
+    oci->postproc_state.last_base_qindex = oci->base_qindex;
+  } else {
+    vp8_yv12_copy_frame(oci->frame_to_show, &oci->post_proc_buffer);
+    oci->postproc_state.last_base_qindex = oci->base_qindex;
+  }
+  oci->postproc_state.last_frame_valid = 1;
+
+  if (flags & VP8D_ADDNOISE) {
+    /* Regenerate the noise pattern only when q or the noise level changed. */
+    if (oci->postproc_state.last_q != q ||
+        oci->postproc_state.last_noise != noise_level) {
+      double sigma;
+      struct postproc_state *ppstate = &oci->postproc_state;
+      vpx_clear_system_state();
+      sigma = noise_level + .5 + .6 * q / 63.0;
+      ppstate->clamp =
+          vpx_setup_noise(sigma, ppstate->generated_noise, oci->Width + 256);
+      ppstate->last_q = q;
+      ppstate->last_noise = noise_level;
+    }
+
+    vpx_plane_add_noise(
+        oci->post_proc_buffer.y_buffer, oci->postproc_state.generated_noise,
+        oci->postproc_state.clamp, oci->postproc_state.clamp,
+        oci->post_proc_buffer.y_width, oci->post_proc_buffer.y_height,
+        oci->post_proc_buffer.y_stride);
+  }
+
+  *dest = oci->post_proc_buffer;
+
+  /* handle problem with extending borders */
+  dest->y_width = oci->Width;
+  dest->y_height = oci->Height;
+  dest->uv_height = dest->y_height / 2;
+  return 0;
+}
+#endif
diff --git a/media/libvpx/libvpx/vp8/common/postproc.h b/media/libvpx/libvpx/vp8/common/postproc.h
new file mode 100644
index 0000000000..492c52aef6
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/postproc.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP8_COMMON_POSTPROC_H_
+#define VPX_VP8_COMMON_POSTPROC_H_
+
+#include "vpx_ports/mem.h"
+/* Post-processing state kept between calls to vp8_post_proc_frame(). */
+struct postproc_state {
+ int last_q; /* q in effect when the noise pattern was last generated */
+ int last_noise; /* noise_level in effect when the noise pattern was last generated */
+ int last_base_qindex; /* base q index remembered from earlier frames; compared with the current one before MFQE runs */
+ int last_frame_valid; /* set to 1 once vp8_post_proc_frame() has handled a frame */
+ int clamp; /* value returned by vpx_setup_noise(), passed on to vpx_plane_add_noise() */
+ int8_t *generated_noise; /* noise buffer, allocated with Width + 256 entries */
+};
+#include "onyxc_int.h"
+#include "ppflags.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+int vp8_post_proc_frame(struct VP8Common *oci, YV12_BUFFER_CONFIG *dest,
+ vp8_ppflags_t *ppflags);
+
+void vp8_de_noise(struct VP8Common *cm, YV12_BUFFER_CONFIG *source, int q,
+ int uvfilter);
+
+void vp8_deblock(struct VP8Common *cm, YV12_BUFFER_CONFIG *source,
+ YV12_BUFFER_CONFIG *post, int q);
+
+#define MFQE_PRECISION 4
+
+void vp8_multiframe_quality_enhance(struct VP8Common *cm);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP8_COMMON_POSTPROC_H_
diff --git a/media/libvpx/libvpx/vp8/common/ppflags.h b/media/libvpx/libvpx/vp8/common/ppflags.h
new file mode 100644
index 0000000000..bdf08734b9
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/ppflags.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP8_COMMON_PPFLAGS_H_
+#define VPX_VP8_COMMON_PPFLAGS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/* Bit flags for vp8_ppflags_t.post_proc_flag; each stage has its own bit, so
+ * several can be combined with bitwise OR. */
+enum {
+ VP8D_NOFILTERING = 0,
+ VP8D_DEBLOCK = 1 << 0,
+ VP8D_DEMACROBLOCK = 1 << 1,
+ VP8D_ADDNOISE = 1 << 2,
+ VP8D_MFQE = 1 << 3
+};
+
+/* Post-processing request read by vp8_post_proc_frame(). */
+typedef struct {
+ int post_proc_flag; /* combination of the VP8D_* flags; 0 disables post-processing */
+ int deblocking_level; /* with VP8D_DEMACROBLOCK, shifts the filter q by (level - 5) * 10 */
+ int noise_level; /* with VP8D_ADDNOISE, base term of the noise sigma */
+ int display_ref_frame_flag;
+ int display_mb_modes_flag;
+ int display_b_modes_flag;
+ int display_mv_flag;
+} vp8_ppflags_t;
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP8_COMMON_PPFLAGS_H_
diff --git a/media/libvpx/libvpx/vp8/common/quant_common.c b/media/libvpx/libvpx/vp8/common/quant_common.c
new file mode 100644
index 0000000000..e290eec92b
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/quant_common.c
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "quant_common.h"
+
+static const int dc_qlookup[QINDEX_RANGE] = { /* read below with an index clamped to 0..127 */
+ 4, 5, 6, 7, 8, 9, 10, 10, 11, 12, 13, 14, 15, 16, 17,
+ 17, 18, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24, 25, 25, 26,
+ 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 37, 38, 39, 40,
+ 41, 42, 43, 44, 45, 46, 46, 47, 48, 49, 50, 51, 52, 53, 54,
+ 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
+ 70, 71, 72, 73, 74, 75, 76, 76, 77, 78, 79, 80, 81, 82, 83,
+ 84, 85, 86, 87, 88, 89, 91, 93, 95, 96, 98, 100, 101, 102, 104,
+ 106, 108, 110, 112, 114, 116, 118, 122, 124, 126, 128, 130, 132, 134, 136,
+ 138, 140, 143, 145, 148, 151, 154, 157,
+};
+
+static const int ac_qlookup[QINDEX_RANGE] = { /* read below with an index clamped to 0..127 */
+ 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
+ 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
+ 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48,
+ 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 60, 62, 64, 66, 68,
+ 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98,
+ 100, 102, 104, 106, 108, 110, 112, 114, 116, 119, 122, 125, 128, 131, 134,
+ 137, 140, 143, 146, 149, 152, 155, 158, 161, 164, 167, 170, 173, 177, 181,
+ 185, 189, 193, 197, 201, 205, 209, 213, 217, 221, 225, 229, 234, 239, 245,
+ 249, 254, 259, 264, 269, 274, 279, 284,
+};
+
+int vp8_dc_quant(int QIndex, int Delta) {
+ int retval;
+ /* Returns dc_qlookup[QIndex + Delta] with the sum clamped to 0..127. */
+ QIndex = QIndex + Delta;
+
+ if (QIndex > 127) {
+ QIndex = 127;
+ } else if (QIndex < 0) {
+ QIndex = 0;
+ }
+
+ retval = dc_qlookup[QIndex];
+ return retval;
+}
+
+int vp8_dc2quant(int QIndex, int Delta) {
+ int retval;
+ /* Returns twice dc_qlookup[QIndex + Delta], the sum clamped to 0..127. */
+ QIndex = QIndex + Delta;
+
+ if (QIndex > 127) {
+ QIndex = 127;
+ } else if (QIndex < 0) {
+ QIndex = 0;
+ }
+
+ retval = dc_qlookup[QIndex] * 2;
+ return retval;
+}
+int vp8_dc_uv_quant(int QIndex, int Delta) {
+ int retval;
+ /* Returns dc_qlookup[QIndex + Delta] (sum clamped to 0..127), capped at 132. */
+ QIndex = QIndex + Delta;
+
+ if (QIndex > 127) {
+ QIndex = 127;
+ } else if (QIndex < 0) {
+ QIndex = 0;
+ }
+
+ retval = dc_qlookup[QIndex];
+ /* Upper bound applied to the table value, not to the index. */
+ if (retval > 132) retval = 132;
+
+ return retval;
+}
+
+int vp8_ac_yquant(int QIndex) {
+ int retval;
+ /* Returns ac_qlookup[QIndex] with QIndex clamped to 0..127; takes no delta. */
+ if (QIndex > 127) {
+ QIndex = 127;
+ } else if (QIndex < 0) {
+ QIndex = 0;
+ }
+
+ retval = ac_qlookup[QIndex];
+ return retval;
+}
+
+int vp8_ac2quant(int QIndex, int Delta) {
+ int retval;
+ /* Returns ac_qlookup[clamped QIndex + Delta] scaled by 155/100, at least 8. */
+ QIndex = QIndex + Delta;
+
+ if (QIndex > 127) {
+ QIndex = 127;
+ } else if (QIndex < 0) {
+ QIndex = 0;
+ }
+
+ /* For all x in [0..284], x*155/100 is bitwise equal to (x*101581) >> 16.
+ * The smallest precision for that is '(x*6349) >> 12' but 16 is a good
+ * word size. */
+ retval = (ac_qlookup[QIndex] * 101581) >> 16;
+ /* Lower bound on the scaled value. */
+ if (retval < 8) retval = 8;
+
+ return retval;
+}
+int vp8_ac_uv_quant(int QIndex, int Delta) {
+ int retval;
+ /* Returns ac_qlookup[QIndex + Delta] with the sum clamped to 0..127. */
+ QIndex = QIndex + Delta;
+
+ if (QIndex > 127) {
+ QIndex = 127;
+ } else if (QIndex < 0) {
+ QIndex = 0;
+ }
+
+ retval = ac_qlookup[QIndex];
+ return retval;
+}
diff --git a/media/libvpx/libvpx/vp8/common/quant_common.h b/media/libvpx/libvpx/vp8/common/quant_common.h
new file mode 100644
index 0000000000..049840a272
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/quant_common.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP8_COMMON_QUANT_COMMON_H_
+#define VPX_VP8_COMMON_QUANT_COMMON_H_
+
+#include "string.h"
+#include "blockd.h"
+#include "onyxc_int.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern int vp8_ac_yquant(int QIndex);
+extern int vp8_dc_quant(int QIndex, int Delta);
+extern int vp8_dc2quant(int QIndex, int Delta);
+extern int vp8_ac2quant(int QIndex, int Delta);
+extern int vp8_dc_uv_quant(int QIndex, int Delta);
+extern int vp8_ac_uv_quant(int QIndex, int Delta);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP8_COMMON_QUANT_COMMON_H_
diff --git a/media/libvpx/libvpx/vp8/common/reconinter.c b/media/libvpx/libvpx/vp8/common/reconinter.c
new file mode 100644
index 0000000000..2cb0709318
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/reconinter.c
@@ -0,0 +1,503 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <limits.h>
+#include <string.h>
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "blockd.h"
+#include "reconinter.h"
+#if CONFIG_RUNTIME_CPU_DETECT
+#include "onyxc_int.h"
+#endif
+
+void vp8_copy_mem16x16_c(unsigned char *src, int src_stride, unsigned char *dst,
+ int dst_stride) {
+ int r;
+ /* Copy 16 rows of 16 bytes; src and dst each advance by their own stride. */
+ for (r = 0; r < 16; ++r) {
+ memcpy(dst, src, 16);
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void vp8_copy_mem8x8_c(unsigned char *src, int src_stride, unsigned char *dst,
+ int dst_stride) {
+ int r;
+ /* Copy 8 rows of 8 bytes; src and dst each advance by their own stride. */
+ for (r = 0; r < 8; ++r) {
+ memcpy(dst, src, 8);
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void vp8_copy_mem8x4_c(unsigned char *src, int src_stride, unsigned char *dst,
+ int dst_stride) {
+ int r;
+ /* Copy 4 rows of 8 bytes; src and dst each advance by their own stride. */
+ for (r = 0; r < 4; ++r) {
+ memcpy(dst, src, 8);
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, unsigned char *base_pre,
+ int pre_stride, vp8_subpix_fn_t sppf) { /* 4x4 prediction into d->predictor */
+ int r;
+ unsigned char *pred_ptr = d->predictor; /* output rows are pitch bytes apart */
+ unsigned char *ptr;
+ ptr = base_pre + d->offset + (d->bmi.mv.as_mv.row >> 3) * pre_stride +
+ (d->bmi.mv.as_mv.col >> 3);
+ /* Any of the low 3 MV bits set: sppf filters with them; otherwise copy 4x4. */
+ if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7) {
+ sppf(ptr, pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7,
+ pred_ptr, pitch);
+ } else {
+ for (r = 0; r < 4; ++r) {
+ pred_ptr[0] = ptr[0];
+ pred_ptr[1] = ptr[1];
+ pred_ptr[2] = ptr[2];
+ pred_ptr[3] = ptr[3];
+ pred_ptr += pitch;
+ ptr += pre_stride;
+ }
+ }
+}
+
+static void build_inter_predictors4b(MACROBLOCKD *x, BLOCKD *d,
+ unsigned char *dst, int dst_stride,
+ unsigned char *base_pre, int pre_stride) {
+ unsigned char *ptr;
+ ptr = base_pre + d->offset + (d->bmi.mv.as_mv.row >> 3) * pre_stride +
+ (d->bmi.mv.as_mv.col >> 3);
+ /* 8x8 prediction using d's MV: sub-pixel filter if low 3 bits set, else copy. */
+ if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7) {
+ x->subpixel_predict8x8(ptr, pre_stride, d->bmi.mv.as_mv.col & 7,
+ d->bmi.mv.as_mv.row & 7, dst, dst_stride);
+ } else {
+ vp8_copy_mem8x8(ptr, pre_stride, dst, dst_stride);
+ }
+}
+
+static void build_inter_predictors2b(MACROBLOCKD *x, BLOCKD *d,
+ unsigned char *dst, int dst_stride,
+ unsigned char *base_pre, int pre_stride) {
+ unsigned char *ptr;
+ ptr = base_pre + d->offset + (d->bmi.mv.as_mv.row >> 3) * pre_stride +
+ (d->bmi.mv.as_mv.col >> 3);
+ /* 8x4 prediction using d's MV: sub-pixel filter if low 3 bits set, else copy. */
+ if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7) {
+ x->subpixel_predict8x4(ptr, pre_stride, d->bmi.mv.as_mv.col & 7,
+ d->bmi.mv.as_mv.row & 7, dst, dst_stride);
+ } else {
+ vp8_copy_mem8x4(ptr, pre_stride, dst, dst_stride);
+ }
+}
+
+static void build_inter_predictors_b(BLOCKD *d, unsigned char *dst,
+ int dst_stride, unsigned char *base_pre,
+ int pre_stride, vp8_subpix_fn_t sppf) {
+ int r;
+ unsigned char *ptr;
+ ptr = base_pre + d->offset + (d->bmi.mv.as_mv.row >> 3) * pre_stride +
+ (d->bmi.mv.as_mv.col >> 3);
+ /* 4x4 prediction written to dst: sppf if low 3 MV bits set, else plain copy. */
+ if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7) {
+ sppf(ptr, pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst,
+ dst_stride);
+ } else {
+ for (r = 0; r < 4; ++r) {
+ dst[0] = ptr[0];
+ dst[1] = ptr[1];
+ dst[2] = ptr[2];
+ dst[3] = ptr[3];
+ dst += dst_stride;
+ ptr += pre_stride;
+ }
+ }
+}
+
+/* Encoder only: 8x8 U and V predictions written into x->predictor, pitch 8. */
+void vp8_build_inter16x16_predictors_mbuv(MACROBLOCKD *x) {
+ unsigned char *uptr, *vptr;
+ unsigned char *upred_ptr = &x->predictor[256];
+ unsigned char *vpred_ptr = &x->predictor[320];
+
+ int mv_row = x->mode_info_context->mbmi.mv.as_mv.row;
+ int mv_col = x->mode_info_context->mbmi.mv.as_mv.col;
+ int offset;
+ int pre_stride = x->pre.uv_stride;
+ /* NOTE(review): the sign extraction below relies on arithmetic right shift. */
+ /* calc uv motion vectors: halve, rounding away from zero, then apply mask */
+ mv_row += 1 | (mv_row >> (sizeof(int) * CHAR_BIT - 1));
+ mv_col += 1 | (mv_col >> (sizeof(int) * CHAR_BIT - 1));
+ mv_row /= 2;
+ mv_col /= 2;
+ mv_row &= x->fullpixel_mask;
+ mv_col &= x->fullpixel_mask;
+
+ offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);
+ uptr = x->pre.u_buffer + offset;
+ vptr = x->pre.v_buffer + offset;
+ /* Same offset and filter phase for both chroma planes. */
+ if ((mv_row | mv_col) & 7) {
+ x->subpixel_predict8x8(uptr, pre_stride, mv_col & 7, mv_row & 7, upred_ptr,
+ 8);
+ x->subpixel_predict8x8(vptr, pre_stride, mv_col & 7, mv_row & 7, vpred_ptr,
+ 8);
+ } else {
+ vp8_copy_mem8x8(uptr, pre_stride, upred_ptr, 8);
+ vp8_copy_mem8x8(vptr, pre_stride, vpred_ptr, 8);
+ }
+}
+
+/* Encoder only: derives chroma MVs for blocks 16..23, then predicts them. */
+void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x) {
+ int i, j;
+ int pre_stride = x->pre.uv_stride;
+ unsigned char *base_pre;
+
+ /* build uv mvs */
+ for (i = 0; i < 2; ++i) {
+ for (j = 0; j < 2; ++j) {
+ int yoffset = i * 8 + j * 2; /* first of the four luma blocks summed below */
+ int uoffset = 16 + i * 2 + j;
+ int voffset = 20 + i * 2 + j;
+
+ int temp;
+
+ temp = x->block[yoffset].bmi.mv.as_mv.row +
+ x->block[yoffset + 1].bmi.mv.as_mv.row +
+ x->block[yoffset + 4].bmi.mv.as_mv.row +
+ x->block[yoffset + 5].bmi.mv.as_mv.row;
+ /* Bias by +4 (or -4 when negative) so the /8 below rounds to nearest. */
+ temp += 4 + ((temp >> (sizeof(temp) * CHAR_BIT - 1)) * 8);
+
+ x->block[uoffset].bmi.mv.as_mv.row = (temp / 8) & x->fullpixel_mask;
+
+ temp = x->block[yoffset].bmi.mv.as_mv.col +
+ x->block[yoffset + 1].bmi.mv.as_mv.col +
+ x->block[yoffset + 4].bmi.mv.as_mv.col +
+ x->block[yoffset + 5].bmi.mv.as_mv.col;
+
+ temp += 4 + ((temp >> (sizeof(temp) * CHAR_BIT - 1)) * 8);
+
+ x->block[uoffset].bmi.mv.as_mv.col = (temp / 8) & x->fullpixel_mask;
+ /* The V block reuses the U block's motion vector. */
+ x->block[voffset].bmi.mv.as_int = x->block[uoffset].bmi.mv.as_int;
+ }
+ }
+
+ base_pre = x->pre.u_buffer;
+ for (i = 16; i < 20; i += 2) { /* U blocks, handled in pairs */
+ BLOCKD *d0 = &x->block[i];
+ BLOCKD *d1 = &x->block[i + 1];
+ /* Equal MVs: one 8x4 prediction covers the pair; otherwise two 4x4 calls. */
+ if (d0->bmi.mv.as_int == d1->bmi.mv.as_int) {
+ build_inter_predictors2b(x, d0, d0->predictor, 8, base_pre, pre_stride);
+ } else {
+ vp8_build_inter_predictors_b(d0, 8, base_pre, pre_stride,
+ x->subpixel_predict);
+ vp8_build_inter_predictors_b(d1, 8, base_pre, pre_stride,
+ x->subpixel_predict);
+ }
+ }
+
+ base_pre = x->pre.v_buffer;
+ for (i = 20; i < 24; i += 2) { /* V blocks, handled in pairs */
+ BLOCKD *d0 = &x->block[i];
+ BLOCKD *d1 = &x->block[i + 1];
+
+ if (d0->bmi.mv.as_int == d1->bmi.mv.as_int) {
+ build_inter_predictors2b(x, d0, d0->predictor, 8, base_pre, pre_stride);
+ } else {
+ vp8_build_inter_predictors_b(d0, 8, base_pre, pre_stride,
+ x->subpixel_predict);
+ vp8_build_inter_predictors_b(d1, 8, base_pre, pre_stride,
+ x->subpixel_predict);
+ }
+ }
+}
+
+/* Encoder only: 16x16 luma prediction from x->pre.y_buffer into dst_y. */
+void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x, unsigned char *dst_y,
+ int dst_ystride) {
+ unsigned char *ptr_base;
+ unsigned char *ptr;
+ int mv_row = x->mode_info_context->mbmi.mv.as_mv.row;
+ int mv_col = x->mode_info_context->mbmi.mv.as_mv.col;
+ int pre_stride = x->pre.y_stride;
+ /* mv >> 3 positions the source pointer; mv & 7 is passed to the filter. */
+ ptr_base = x->pre.y_buffer;
+ ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3);
+
+ if ((mv_row | mv_col) & 7) {
+ x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7, dst_y,
+ dst_ystride);
+ } else {
+ vp8_copy_mem16x16(ptr, pre_stride, dst_y, dst_ystride);
+ }
+}
+
+static void clamp_mv_to_umv_border(MV *mv, const MACROBLOCKD *xd) { /* edits *mv in place */
+ /* If the MV points so far into the UMV border that no visible pixels
+ * are used for reconstruction, the subpel part of the MV can be
+ * discarded and the MV limited to 16 pixels with equivalent results.
+ *
+ * This limit kicks in at 19 pixels for the top and left edges, for
+ * the 16 pixels plus 3 taps right of the central pixel when subpel
+ * filtering. The bottom and right edges use 16 pixels plus 2 pixels
+ * left of the central pixel when filtering.
+ */
+ if (mv->col < (xd->mb_to_left_edge - (19 << 3))) {
+ mv->col = xd->mb_to_left_edge - (16 << 3);
+ } else if (mv->col > xd->mb_to_right_edge + (18 << 3)) {
+ mv->col = xd->mb_to_right_edge + (16 << 3);
+ }
+ /* Same thresholds applied to the vertical component. */
+ if (mv->row < (xd->mb_to_top_edge - (19 << 3))) {
+ mv->row = xd->mb_to_top_edge - (16 << 3);
+ } else if (mv->row > xd->mb_to_bottom_edge + (18 << 3)) {
+ mv->row = xd->mb_to_bottom_edge + (16 << 3);
+ }
+}
+
+/* Chroma variant of the above: tests 2 * mv against the same limits, stores limit >> 1. */
+static void clamp_uvmv_to_umv_border(MV *mv, const MACROBLOCKD *xd) {
+ mv->col = (2 * mv->col < (xd->mb_to_left_edge - (19 << 3)))
+ ? (xd->mb_to_left_edge - (16 << 3)) >> 1
+ : mv->col;
+ mv->col = (2 * mv->col > xd->mb_to_right_edge + (18 << 3))
+ ? (xd->mb_to_right_edge + (16 << 3)) >> 1
+ : mv->col;
+ /* Vertical component, same pattern. */
+ mv->row = (2 * mv->row < (xd->mb_to_top_edge - (19 << 3)))
+ ? (xd->mb_to_top_edge - (16 << 3)) >> 1
+ : mv->row;
+ mv->row = (2 * mv->row > xd->mb_to_bottom_edge + (18 << 3))
+ ? (xd->mb_to_bottom_edge + (16 << 3)) >> 1
+ : mv->row;
+}
+
+void vp8_build_inter16x16_predictors_mb(MACROBLOCKD *x, unsigned char *dst_y,
+ unsigned char *dst_u,
+ unsigned char *dst_v, int dst_ystride,
+ int dst_uvstride) { /* 16x16 luma plus 8x8 U and V from one MV */
+ int offset;
+ unsigned char *ptr;
+ unsigned char *uptr, *vptr;
+
+ int_mv _16x16mv; /* local copy, so clamping does not touch mode_info_context */
+
+ unsigned char *ptr_base = x->pre.y_buffer;
+ int pre_stride = x->pre.y_stride;
+
+ _16x16mv.as_int = x->mode_info_context->mbmi.mv.as_int;
+
+ if (x->mode_info_context->mbmi.need_to_clamp_mvs) {
+ clamp_mv_to_umv_border(&_16x16mv.as_mv, x);
+ }
+
+ ptr = ptr_base + (_16x16mv.as_mv.row >> 3) * pre_stride +
+ (_16x16mv.as_mv.col >> 3);
+ /* presumably row/col are the two 16-bit halves of as_int - confirm in int_mv */
+ if (_16x16mv.as_int & 0x00070007) {
+ x->subpixel_predict16x16(ptr, pre_stride, _16x16mv.as_mv.col & 7,
+ _16x16mv.as_mv.row & 7, dst_y, dst_ystride);
+ } else {
+ vp8_copy_mem16x16(ptr, pre_stride, dst_y, dst_ystride);
+ }
+
+ /* calc uv motion vectors: halve, rounding away from zero, then apply mask */
+ _16x16mv.as_mv.row +=
+ 1 | (_16x16mv.as_mv.row >> (sizeof(int) * CHAR_BIT - 1));
+ _16x16mv.as_mv.col +=
+ 1 | (_16x16mv.as_mv.col >> (sizeof(int) * CHAR_BIT - 1));
+ _16x16mv.as_mv.row /= 2;
+ _16x16mv.as_mv.col /= 2;
+ _16x16mv.as_mv.row &= x->fullpixel_mask;
+ _16x16mv.as_mv.col &= x->fullpixel_mask;
+ /* Chroma MV beyond the limits used by clamp_uvmv_to_umv_border(): give up. */
+ if (2 * _16x16mv.as_mv.col < (x->mb_to_left_edge - (19 << 3)) ||
+ 2 * _16x16mv.as_mv.col > x->mb_to_right_edge + (18 << 3) ||
+ 2 * _16x16mv.as_mv.row < (x->mb_to_top_edge - (19 << 3)) ||
+ 2 * _16x16mv.as_mv.row > x->mb_to_bottom_edge + (18 << 3)) {
+ return; /* dst_u and dst_v are left unwritten on this path */
+ }
+
+ pre_stride >>= 1; /* chroma stride taken as half of x->pre.y_stride */
+ offset = (_16x16mv.as_mv.row >> 3) * pre_stride + (_16x16mv.as_mv.col >> 3);
+ uptr = x->pre.u_buffer + offset;
+ vptr = x->pre.v_buffer + offset;
+
+ if (_16x16mv.as_int & 0x00070007) {
+ x->subpixel_predict8x8(uptr, pre_stride, _16x16mv.as_mv.col & 7,
+ _16x16mv.as_mv.row & 7, dst_u, dst_uvstride);
+ x->subpixel_predict8x8(vptr, pre_stride, _16x16mv.as_mv.col & 7,
+ _16x16mv.as_mv.row & 7, dst_v, dst_uvstride);
+ } else {
+ vp8_copy_mem8x8(uptr, pre_stride, dst_u, dst_uvstride);
+ vp8_copy_mem8x8(vptr, pre_stride, dst_v, dst_uvstride);
+ }
+}
+
+static void build_inter4x4_predictors_mb(MACROBLOCKD *x) { /* split-MV prediction into x->dst */
+ int i;
+ unsigned char *base_dst = x->dst.y_buffer;
+ unsigned char *base_pre = x->pre.y_buffer;
+ /* NOTE(review): every call below passes the dst stride as the pre stride too. */
+ if (x->mode_info_context->mbmi.partitioning < 3) {
+ BLOCKD *b;
+ int dst_stride = x->dst.y_stride;
+ /* Four 8x8 predictions, each driven by the MV of blocks 0, 2, 8 and 10. */
+ x->block[0].bmi = x->mode_info_context->bmi[0];
+ x->block[2].bmi = x->mode_info_context->bmi[2];
+ x->block[8].bmi = x->mode_info_context->bmi[8];
+ x->block[10].bmi = x->mode_info_context->bmi[10];
+ if (x->mode_info_context->mbmi.need_to_clamp_mvs) {
+ clamp_mv_to_umv_border(&x->block[0].bmi.mv.as_mv, x);
+ clamp_mv_to_umv_border(&x->block[2].bmi.mv.as_mv, x);
+ clamp_mv_to_umv_border(&x->block[8].bmi.mv.as_mv, x);
+ clamp_mv_to_umv_border(&x->block[10].bmi.mv.as_mv, x);
+ }
+
+ b = &x->block[0];
+ build_inter_predictors4b(x, b, base_dst + b->offset, dst_stride, base_pre,
+ dst_stride);
+ b = &x->block[2];
+ build_inter_predictors4b(x, b, base_dst + b->offset, dst_stride, base_pre,
+ dst_stride);
+ b = &x->block[8];
+ build_inter_predictors4b(x, b, base_dst + b->offset, dst_stride, base_pre,
+ dst_stride);
+ b = &x->block[10];
+ build_inter_predictors4b(x, b, base_dst + b->offset, dst_stride, base_pre,
+ dst_stride);
+ } else {
+ for (i = 0; i < 16; i += 2) { /* luma blocks in pairs */
+ BLOCKD *d0 = &x->block[i];
+ BLOCKD *d1 = &x->block[i + 1];
+ int dst_stride = x->dst.y_stride;
+
+ x->block[i + 0].bmi = x->mode_info_context->bmi[i + 0];
+ x->block[i + 1].bmi = x->mode_info_context->bmi[i + 1];
+ if (x->mode_info_context->mbmi.need_to_clamp_mvs) {
+ clamp_mv_to_umv_border(&x->block[i + 0].bmi.mv.as_mv, x);
+ clamp_mv_to_umv_border(&x->block[i + 1].bmi.mv.as_mv, x);
+ }
+ /* Equal MVs: one 8x4 prediction covers the pair; otherwise two 4x4 calls. */
+ if (d0->bmi.mv.as_int == d1->bmi.mv.as_int) {
+ build_inter_predictors2b(x, d0, base_dst + d0->offset, dst_stride,
+ base_pre, dst_stride);
+ } else {
+ build_inter_predictors_b(d0, base_dst + d0->offset, dst_stride,
+ base_pre, dst_stride, x->subpixel_predict);
+ build_inter_predictors_b(d1, base_dst + d1->offset, dst_stride,
+ base_pre, dst_stride, x->subpixel_predict);
+ }
+ }
+ }
+ base_dst = x->dst.u_buffer;
+ base_pre = x->pre.u_buffer;
+ for (i = 16; i < 20; i += 2) { /* U blocks in pairs */
+ BLOCKD *d0 = &x->block[i];
+ BLOCKD *d1 = &x->block[i + 1];
+ int dst_stride = x->dst.uv_stride;
+
+ /* Note: uv mvs already clamped in build_4x4uvmvs() */
+
+ if (d0->bmi.mv.as_int == d1->bmi.mv.as_int) {
+ build_inter_predictors2b(x, d0, base_dst + d0->offset, dst_stride,
+ base_pre, dst_stride);
+ } else {
+ build_inter_predictors_b(d0, base_dst + d0->offset, dst_stride, base_pre,
+ dst_stride, x->subpixel_predict);
+ build_inter_predictors_b(d1, base_dst + d1->offset, dst_stride, base_pre,
+ dst_stride, x->subpixel_predict);
+ }
+ }
+
+ base_dst = x->dst.v_buffer;
+ base_pre = x->pre.v_buffer;
+ for (i = 20; i < 24; i += 2) { /* V blocks in pairs */
+ BLOCKD *d0 = &x->block[i];
+ BLOCKD *d1 = &x->block[i + 1];
+ int dst_stride = x->dst.uv_stride;
+
+ /* Note: uv mvs already clamped in build_4x4uvmvs() */
+
+ if (d0->bmi.mv.as_int == d1->bmi.mv.as_int) {
+ build_inter_predictors2b(x, d0, base_dst + d0->offset, dst_stride,
+ base_pre, dst_stride);
+ } else {
+ build_inter_predictors_b(d0, base_dst + d0->offset, dst_stride, base_pre,
+ dst_stride, x->subpixel_predict);
+ build_inter_predictors_b(d1, base_dst + d1->offset, dst_stride, base_pre,
+ dst_stride, x->subpixel_predict);
+ }
+ }
+}
+
+static void build_4x4uvmvs(MACROBLOCKD *x) { /* fills MVs of chroma blocks 16..23 */
+ int i, j;
+
+ for (i = 0; i < 2; ++i) {
+ for (j = 0; j < 2; ++j) {
+ int yoffset = i * 8 + j * 2; /* first of the four luma MVs summed below */
+ int uoffset = 16 + i * 2 + j;
+ int voffset = 20 + i * 2 + j;
+
+ int temp;
+
+ temp = x->mode_info_context->bmi[yoffset + 0].mv.as_mv.row +
+ x->mode_info_context->bmi[yoffset + 1].mv.as_mv.row +
+ x->mode_info_context->bmi[yoffset + 4].mv.as_mv.row +
+ x->mode_info_context->bmi[yoffset + 5].mv.as_mv.row;
+ /* Bias by +4 (or -4 when negative) so the /8 below rounds to nearest. */
+ temp += 4 + ((temp >> (sizeof(temp) * CHAR_BIT - 1)) * 8);
+
+ x->block[uoffset].bmi.mv.as_mv.row = (temp / 8) & x->fullpixel_mask;
+
+ temp = x->mode_info_context->bmi[yoffset + 0].mv.as_mv.col +
+ x->mode_info_context->bmi[yoffset + 1].mv.as_mv.col +
+ x->mode_info_context->bmi[yoffset + 4].mv.as_mv.col +
+ x->mode_info_context->bmi[yoffset + 5].mv.as_mv.col;
+
+ temp += 4 + ((temp >> (sizeof(temp) * CHAR_BIT - 1)) * 8);
+
+ x->block[uoffset].bmi.mv.as_mv.col = (temp / 8) & x->fullpixel_mask;
+
+ if (x->mode_info_context->mbmi.need_to_clamp_mvs) {
+ clamp_uvmv_to_umv_border(&x->block[uoffset].bmi.mv.as_mv, x);
+ }
+ /* The V block reuses the (possibly clamped) U motion vector. */
+ x->block[voffset].bmi.mv.as_int = x->block[uoffset].bmi.mv.as_int;
+ }
+ }
+}
+
+void vp8_build_inter_predictors_mb(MACROBLOCKD *xd) { /* predicts into xd->dst */
+ if (xd->mode_info_context->mbmi.mode != SPLITMV) { /* one MV for the whole MB */
+ vp8_build_inter16x16_predictors_mb(xd, xd->dst.y_buffer, xd->dst.u_buffer,
+ xd->dst.v_buffer, xd->dst.y_stride,
+ xd->dst.uv_stride);
+ } else { /* SPLITMV: chroma MVs must be derived before predicting */
+ build_4x4uvmvs(xd);
+ build_inter4x4_predictors_mb(xd);
+ }
+}
diff --git a/media/libvpx/libvpx/vp8/common/reconinter.h b/media/libvpx/libvpx/vp8/common/reconinter.h
new file mode 100644
index 0000000000..974e7ce754
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/reconinter.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP8_COMMON_RECONINTER_H_
+#define VPX_VP8_COMMON_RECONINTER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp8_build_inter_predictors_mb(MACROBLOCKD *xd);
+void vp8_build_inter16x16_predictors_mb(MACROBLOCKD *x, unsigned char *dst_y,
+ unsigned char *dst_u,
+ unsigned char *dst_v, int dst_ystride,
+ int dst_uvstride);
+
+void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x, unsigned char *dst_y,
+ int dst_ystride);
+void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, unsigned char *base_pre,
+ int pre_stride, vp8_subpix_fn_t sppf);
+
+void vp8_build_inter16x16_predictors_mbuv(MACROBLOCKD *x);
+void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP8_COMMON_RECONINTER_H_
diff --git a/media/libvpx/libvpx/vp8/common/reconintra.c b/media/libvpx/libvpx/vp8/common/reconintra.c
new file mode 100644
index 0000000000..8e2094da87
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/reconintra.c
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "./vp8_rtcd.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/vpx_once.h"
+#include "blockd.h"
+#include "vp8/common/reconintra.h"
+#include "vp8/common/reconintra4x4.h"
+
+enum { /* last index of the predictor tables below */
+ SIZE_16,
+ SIZE_8,
+ NUM_SIZES,
+};
+
+typedef void (*intra_pred_fn)(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left);
+
+static intra_pred_fn pred[4][NUM_SIZES]; /* [prediction mode][size] */
+static intra_pred_fn dc_pred[2][2][NUM_SIZES]; /* [left_available][up_available][size] */
+
+static void vp8_init_intra_predictors_internal(void) { /* run via once() below */
+#define INIT_SIZE(sz) \
+ pred[V_PRED][SIZE_##sz] = vpx_v_predictor_##sz##x##sz; \
+ pred[H_PRED][SIZE_##sz] = vpx_h_predictor_##sz##x##sz; \
+ pred[TM_PRED][SIZE_##sz] = vpx_tm_predictor_##sz##x##sz; \
+ \
+ dc_pred[0][0][SIZE_##sz] = vpx_dc_128_predictor_##sz##x##sz; \
+ dc_pred[0][1][SIZE_##sz] = vpx_dc_top_predictor_##sz##x##sz; \
+ dc_pred[1][0][SIZE_##sz] = vpx_dc_left_predictor_##sz##x##sz; \
+ dc_pred[1][1][SIZE_##sz] = vpx_dc_predictor_##sz##x##sz
+ /* Fill the 16x16 and 8x8 entries, then the separate 4x4 table. */
+ INIT_SIZE(16);
+ INIT_SIZE(8);
+ vp8_init_intra4x4_predictors_internal();
+}
+
+void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x, unsigned char *yabove_row,
+ unsigned char *yleft, int left_stride,
+ unsigned char *ypred_ptr, int y_stride) {
+ MB_PREDICTION_MODE mode = x->mode_info_context->mbmi.mode;
+ DECLARE_ALIGNED(16, uint8_t, yleft_col[16]);
+ int i;
+ intra_pred_fn fn;
+ /* Gather the 16 left-column pixels (left_stride apart) into a packed array. */
+ for (i = 0; i < 16; ++i) {
+ yleft_col[i] = yleft[i * left_stride];
+ }
+ /* DC_PRED picks its variant by neighbour availability; others go by mode. */
+ if (mode == DC_PRED) {
+ fn = dc_pred[x->left_available][x->up_available][SIZE_16];
+ } else {
+ fn = pred[mode][SIZE_16];
+ }
+
+ fn(ypred_ptr, y_stride, yabove_row, yleft_col);
+}
+
+void vp8_build_intra_predictors_mbuv_s(
+ MACROBLOCKD *x, unsigned char *uabove_row, unsigned char *vabove_row,
+ unsigned char *uleft, unsigned char *vleft, int left_stride,
+ unsigned char *upred_ptr, unsigned char *vpred_ptr, int pred_stride) {
+ MB_PREDICTION_MODE uvmode = x->mode_info_context->mbmi.uv_mode;
+#if HAVE_VSX
+ /* Power PC implementation uses "vec_vsx_ld" to read 16 bytes from
+ uleft_col and vleft_col. Play it safe by reserving enough stack
+ space here. */
+ unsigned char uleft_col[16];
+ unsigned char vleft_col[16];
+#else
+ unsigned char uleft_col[8];
+ unsigned char vleft_col[8];
+#endif
+ int i;
+ intra_pred_fn fn;
+ /* Gather the 8 left-column pixels of each chroma plane into packed arrays. */
+ for (i = 0; i < 8; ++i) {
+ uleft_col[i] = uleft[i * left_stride];
+ vleft_col[i] = vleft[i * left_stride];
+ }
+ /* DC_PRED picks its variant by neighbour availability; others go by mode. */
+ if (uvmode == DC_PRED) {
+ fn = dc_pred[x->left_available][x->up_available][SIZE_8];
+ } else {
+ fn = pred[uvmode][SIZE_8];
+ }
+ /* The same predictor function is applied to U and then V. */
+ fn(upred_ptr, pred_stride, uabove_row, uleft_col);
+ fn(vpred_ptr, pred_stride, vabove_row, vleft_col);
+}
+
+void vp8_init_intra_predictors(void) { /* public entry point for table setup */
+ once(vp8_init_intra_predictors_internal); /* routed through once() */
+}
diff --git a/media/libvpx/libvpx/vp8/common/reconintra.h b/media/libvpx/libvpx/vp8/common/reconintra.h
new file mode 100644
index 0000000000..029ac00a24
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/reconintra.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP8_COMMON_RECONINTRA_H_
+#define VPX_VP8_COMMON_RECONINTRA_H_
+
+#include "vp8/common/blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x, unsigned char *yabove_row,
+ unsigned char *yleft, int left_stride,
+ unsigned char *ypred_ptr, int y_stride);
+
+void vp8_build_intra_predictors_mbuv_s(
+ MACROBLOCKD *x, unsigned char *uabove_row, unsigned char *vabove_row,
+ unsigned char *uleft, unsigned char *vleft, int left_stride,
+ unsigned char *upred_ptr, unsigned char *vpred_ptr, int pred_stride);
+
+void vp8_init_intra_predictors(void);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP8_COMMON_RECONINTRA_H_
diff --git a/media/libvpx/libvpx/vp8/common/reconintra4x4.c b/media/libvpx/libvpx/vp8/common/reconintra4x4.c
new file mode 100644
index 0000000000..be936df5e0
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/reconintra4x4.c
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string.h>
+
+#include "vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vp8_rtcd.h"
+#include "blockd.h"
+#include "reconintra4x4.h"
+#include "vp8/common/common.h"
+#include "vpx_ports/compiler_attributes.h"
+
+typedef void (*intra_pred_fn)(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left);
+
+static intra_pred_fn pred[10]; /* indexed by B_PREDICTION_MODE; filled below */
+
+void vp8_init_intra4x4_predictors_internal(void) { /* binds each 4x4 mode to its vpx_dsp predictor */
+ pred[B_DC_PRED] = vpx_dc_predictor_4x4;
+ pred[B_TM_PRED] = vpx_tm_predictor_4x4;
+ pred[B_VE_PRED] = vpx_ve_predictor_4x4;
+ pred[B_HE_PRED] = vpx_he_predictor_4x4;
+ pred[B_LD_PRED] = vpx_d45e_predictor_4x4;
+ pred[B_RD_PRED] = vpx_d135_predictor_4x4;
+ pred[B_VR_PRED] = vpx_d117_predictor_4x4;
+ pred[B_VL_PRED] = vpx_d63e_predictor_4x4;
+ pred[B_HD_PRED] = vpx_d153_predictor_4x4;
+ pred[B_HU_PRED] = vpx_d207_predictor_4x4;
+}
+
+void vp8_intra4x4_predict(unsigned char *above, unsigned char *yleft,
+ int left_stride, B_PREDICTION_MODE b_mode,
+ unsigned char *dst, int dst_stride,
+ unsigned char top_left) {
+/* Power PC implementation uses "vec_vsx_ld" to read 16 bytes from
+ Above (aka, Aboveb + 4). Play it safe by reserving enough stack
+ space here. Similarly for "Left". */
+#if HAVE_VSX
+ unsigned char Aboveb[20];
+#else
+ unsigned char Aboveb[12];
+#endif
+ unsigned char *Above = Aboveb + 4; /* leaves room for Above[-1] */
+#if HAVE_NEON
+ // Neon intrinsics are unable to load 32 bits, or 4 8 bit values. Instead, it
+ // over reads but does not use the extra 4 values.
+ unsigned char Left[8];
+#if VPX_WITH_ASAN
+ // Silence an 'uninitialized read' warning. Although uninitialized values are
+ // indeed read, they are not used.
+ vp8_zero_array(Left, 8);
+#endif // VPX_WITH_ASAN
+#elif HAVE_VSX
+ unsigned char Left[16];
+#else
+ unsigned char Left[4];
+#endif // HAVE_NEON
+ /* Pack 4 left pixels (left_stride apart), 8 above pixels and the corner. */
+ Left[0] = yleft[0];
+ Left[1] = yleft[left_stride];
+ Left[2] = yleft[2 * left_stride];
+ Left[3] = yleft[3 * left_stride];
+ memcpy(Above, above, 8);
+ Above[-1] = top_left;
+ /* Dispatch through the table filled by vp8_init_intra4x4_predictors_internal(). */
+ pred[b_mode](dst, dst_stride, Above, Left);
+}
diff --git a/media/libvpx/libvpx/vp8/common/reconintra4x4.h b/media/libvpx/libvpx/vp8/common/reconintra4x4.h
new file mode 100644
index 0000000000..3618ec5cbe
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/reconintra4x4.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP8_COMMON_RECONINTRA4X4_H_
+#define VPX_VP8_COMMON_RECONINTRA4X4_H_
+#include "vp8/common/blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static INLINE void intra_prediction_down_copy(MACROBLOCKD *xd,
+ unsigned char *above_right_src) {
+ int dst_stride = xd->dst.y_stride;
+ unsigned char *above_right_dst = xd->dst.y_buffer - dst_stride + 16;
+ /* NOTE(review): the casts below assume unsigned-int-aligned addresses - confirm. */
+ unsigned int *src_ptr = (unsigned int *)above_right_src;
+ unsigned int *dst_ptr0 = (unsigned int *)(above_right_dst + 4 * dst_stride);
+ unsigned int *dst_ptr1 = (unsigned int *)(above_right_dst + 8 * dst_stride);
+ unsigned int *dst_ptr2 = (unsigned int *)(above_right_dst + 12 * dst_stride);
+ /* Replicate one unsigned int from the source at column 16 of rows 3, 7 and 11. */
+ *dst_ptr0 = *src_ptr;
+ *dst_ptr1 = *src_ptr;
+ *dst_ptr2 = *src_ptr;
+}
+
+void vp8_intra4x4_predict(unsigned char *above, unsigned char *yleft,
+ int left_stride, B_PREDICTION_MODE b_mode,
+ unsigned char *dst, int dst_stride,
+ unsigned char top_left);
+
+void vp8_init_intra4x4_predictors_internal(void);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP8_COMMON_RECONINTRA4X4_H_
diff --git a/media/libvpx/libvpx/vp8/common/rtcd.c b/media/libvpx/libvpx/vp8/common/rtcd.c
new file mode 100644
index 0000000000..09a0e2b4b3
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/rtcd.c
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include "./vpx_config.h"
+#define RTCD_C
+#include "./vp8_rtcd.h"
+#include "vpx_ports/vpx_once.h"
+
+void vp8_rtcd(void) { once(setup_rtcd_internal); } /* (void): a real prototype, not an unspecified-args declarator */
diff --git a/media/libvpx/libvpx/vp8/common/rtcd_defs.pl b/media/libvpx/libvpx/vp8/common/rtcd_defs.pl
new file mode 100644
index 0000000000..739a612847
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/rtcd_defs.pl
@@ -0,0 +1,250 @@
+##
+## Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+##
+## Use of this source code is governed by a BSD-style license
+## that can be found in the LICENSE file in the root of the source
+## tree. An additional intellectual property rights grant can be found
+## in the file PATENTS. All contributing project authors may
+## be found in the AUTHORS file in the root of the source tree.
+##
+
+# Prints a block of C forward declarations (struct/union tags only) exactly
+# as written in the heredoc below. The add_proto lines later in this file
+# use these tags in their argument lists (e.g. "struct blockd*",
+# "struct macroblock *", "union int_mv *").
+sub vp8_common_forward_decls() {
+print <<EOF
+/*
+ * VP8
+ */
+
+struct blockd;
+struct macroblockd;
+struct loop_filter_info;
+
+/* Encoder forward decls */
+struct block;
+struct macroblock;
+struct variance_vtable;
+union int_mv;
+struct yv12_buffer_config;
+EOF
+}
+# Passes the sub's name to forward_decls. NOTE(review): presumably the rtcd
+# generator calls the named sub while writing the header -- confirm against
+# the script that defines forward_decls.
+forward_decls qw/vp8_common_forward_decls/;
+
+#
+# Dequant
+#
+add_proto qw/void vp8_dequantize_b/, "struct blockd*, short *DQC";
+specialize qw/vp8_dequantize_b mmx neon msa mmi/;
+
+add_proto qw/void vp8_dequant_idct_add/, "short *input, short *dq, unsigned char *dest, int stride";
+specialize qw/vp8_dequant_idct_add mmx neon dspr2 msa mmi/;
+
+add_proto qw/void vp8_dequant_idct_add_y_block/, "short *q, short *dq, unsigned char *dst, int stride, char *eobs";
+specialize qw/vp8_dequant_idct_add_y_block sse2 neon dspr2 msa mmi lsx/;
+
+add_proto qw/void vp8_dequant_idct_add_uv_block/, "short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs";
+specialize qw/vp8_dequant_idct_add_uv_block sse2 neon dspr2 msa mmi lsx/;
+
+#
+# Loopfilter
+#
+add_proto qw/void vp8_loop_filter_mbv/, "unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi";
+specialize qw/vp8_loop_filter_mbv sse2 neon dspr2 msa mmi lsx/;
+
+add_proto qw/void vp8_loop_filter_bv/, "unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi";
+specialize qw/vp8_loop_filter_bv sse2 neon dspr2 msa mmi lsx/;
+
+add_proto qw/void vp8_loop_filter_mbh/, "unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi";
+specialize qw/vp8_loop_filter_mbh sse2 neon dspr2 msa mmi lsx/;
+
+add_proto qw/void vp8_loop_filter_bh/, "unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi";
+specialize qw/vp8_loop_filter_bh sse2 neon dspr2 msa mmi lsx/;
+
+
+# Simple loop filter entry points (mbv, mbh, bv, bh). For these four the
+# registered name differs from the name of the implementing function, so each
+# listed specialization (c, sse2, neon, msa, mmi) is mapped to its symbol with
+# a $<registered name>_<ext> assignment.
+# NOTE(review): presumably the rtcd generator reads these variables when it
+# emits the dispatch header -- confirm against the generator script.
+add_proto qw/void vp8_loop_filter_simple_mbv/, "unsigned char *y_ptr, int y_stride, const unsigned char *blimit";
+specialize qw/vp8_loop_filter_simple_mbv sse2 neon msa mmi/;
+$vp8_loop_filter_simple_mbv_c=vp8_loop_filter_simple_vertical_edge_c;
+$vp8_loop_filter_simple_mbv_sse2=vp8_loop_filter_simple_vertical_edge_sse2;
+$vp8_loop_filter_simple_mbv_neon=vp8_loop_filter_mbvs_neon;
+$vp8_loop_filter_simple_mbv_msa=vp8_loop_filter_simple_vertical_edge_msa;
+$vp8_loop_filter_simple_mbv_mmi=vp8_loop_filter_simple_vertical_edge_mmi;
+
+add_proto qw/void vp8_loop_filter_simple_mbh/, "unsigned char *y_ptr, int y_stride, const unsigned char *blimit";
+specialize qw/vp8_loop_filter_simple_mbh sse2 neon msa mmi/;
+$vp8_loop_filter_simple_mbh_c=vp8_loop_filter_simple_horizontal_edge_c;
+$vp8_loop_filter_simple_mbh_sse2=vp8_loop_filter_simple_horizontal_edge_sse2;
+$vp8_loop_filter_simple_mbh_neon=vp8_loop_filter_mbhs_neon;
+$vp8_loop_filter_simple_mbh_msa=vp8_loop_filter_simple_horizontal_edge_msa;
+$vp8_loop_filter_simple_mbh_mmi=vp8_loop_filter_simple_horizontal_edge_mmi;
+
+add_proto qw/void vp8_loop_filter_simple_bv/, "unsigned char *y_ptr, int y_stride, const unsigned char *blimit";
+specialize qw/vp8_loop_filter_simple_bv sse2 neon msa mmi/;
+$vp8_loop_filter_simple_bv_c=vp8_loop_filter_bvs_c;
+$vp8_loop_filter_simple_bv_sse2=vp8_loop_filter_bvs_sse2;
+$vp8_loop_filter_simple_bv_neon=vp8_loop_filter_bvs_neon;
+$vp8_loop_filter_simple_bv_msa=vp8_loop_filter_bvs_msa;
+$vp8_loop_filter_simple_bv_mmi=vp8_loop_filter_bvs_mmi;
+
+add_proto qw/void vp8_loop_filter_simple_bh/, "unsigned char *y_ptr, int y_stride, const unsigned char *blimit";
+specialize qw/vp8_loop_filter_simple_bh sse2 neon msa mmi/;
+$vp8_loop_filter_simple_bh_c=vp8_loop_filter_bhs_c;
+$vp8_loop_filter_simple_bh_sse2=vp8_loop_filter_bhs_sse2;
+$vp8_loop_filter_simple_bh_neon=vp8_loop_filter_bhs_neon;
+$vp8_loop_filter_simple_bh_msa=vp8_loop_filter_bhs_msa;
+$vp8_loop_filter_simple_bh_mmi=vp8_loop_filter_bhs_mmi;
+
+#
+# IDCT
+#
+#idct16
+add_proto qw/void vp8_short_idct4x4llm/, "short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride";
+specialize qw/vp8_short_idct4x4llm mmx neon dspr2 msa mmi/;
+
+#iwalsh1
+add_proto qw/void vp8_short_inv_walsh4x4_1/, "short *input, short *mb_dqcoeff";
+specialize qw/vp8_short_inv_walsh4x4_1 dspr2/;
+
+#iwalsh16
+add_proto qw/void vp8_short_inv_walsh4x4/, "short *input, short *mb_dqcoeff";
+specialize qw/vp8_short_inv_walsh4x4 sse2 neon dspr2 msa mmi/;
+
+#idct1_scalar_add
+add_proto qw/void vp8_dc_only_idct_add/, "short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride";
+specialize qw/vp8_dc_only_idct_add mmx neon dspr2 msa mmi lsx/;
+
+#
+# RECON
+#
+add_proto qw/void vp8_copy_mem16x16/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride";
+specialize qw/vp8_copy_mem16x16 sse2 neon dspr2 msa mmi/;
+
+add_proto qw/void vp8_copy_mem8x8/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride";
+specialize qw/vp8_copy_mem8x8 mmx neon dspr2 msa mmi/;
+
+add_proto qw/void vp8_copy_mem8x4/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride";
+specialize qw/vp8_copy_mem8x4 mmx neon dspr2 msa mmi/;
+
+#
+# Postproc
+#
+# Everything in this block is registered only when CONFIG_POSTPROC is "yes".
+if (vpx_config("CONFIG_POSTPROC") eq "yes") {
+
+ # Blend helpers: prototype only, no specialize line follows any of them.
+ add_proto qw/void vp8_blend_mb_inner/, "unsigned char *y, unsigned char *u, unsigned char *v, int y_1, int u_1, int v_1, int alpha, int stride";
+
+ add_proto qw/void vp8_blend_mb_outer/, "unsigned char *y, unsigned char *u, unsigned char *v, int y_1, int u_1, int v_1, int alpha, int stride";
+
+ add_proto qw/void vp8_blend_b/, "unsigned char *y, unsigned char *u, unsigned char *v, int y_1, int u_1, int v_1, int alpha, int stride";
+
+ # Weighted filters: the 16x16 and 8x8 sizes list sse2 and msa
+ # specializations; the 4x4 size has a prototype only.
+ add_proto qw/void vp8_filter_by_weight16x16/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight";
+ specialize qw/vp8_filter_by_weight16x16 sse2 msa/;
+
+ add_proto qw/void vp8_filter_by_weight8x8/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight";
+ specialize qw/vp8_filter_by_weight8x8 sse2 msa/;
+
+ add_proto qw/void vp8_filter_by_weight4x4/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight";
+}
+
+#
+# Subpixel
+#
+add_proto qw/void vp8_sixtap_predict16x16/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch";
+specialize qw/vp8_sixtap_predict16x16 sse2 ssse3 neon dspr2 msa mmi lsx/;
+
+add_proto qw/void vp8_sixtap_predict8x8/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch";
+specialize qw/vp8_sixtap_predict8x8 sse2 ssse3 neon dspr2 msa mmi lsx/;
+
+add_proto qw/void vp8_sixtap_predict8x4/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch";
+specialize qw/vp8_sixtap_predict8x4 sse2 ssse3 neon dspr2 msa mmi/;
+
+add_proto qw/void vp8_sixtap_predict4x4/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch";
+specialize qw/vp8_sixtap_predict4x4 mmx ssse3 neon dspr2 msa mmi lsx/;
+
+add_proto qw/void vp8_bilinear_predict16x16/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch";
+specialize qw/vp8_bilinear_predict16x16 sse2 ssse3 neon msa/;
+
+add_proto qw/void vp8_bilinear_predict8x8/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch";
+specialize qw/vp8_bilinear_predict8x8 sse2 ssse3 neon msa/;
+
+add_proto qw/void vp8_bilinear_predict8x4/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch";
+specialize qw/vp8_bilinear_predict8x4 sse2 neon msa/;
+
+add_proto qw/void vp8_bilinear_predict4x4/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch";
+specialize qw/vp8_bilinear_predict4x4 sse2 neon msa/;
+
+#
+# Encoder functions below this point.
+#
+if (vpx_config("CONFIG_VP8_ENCODER") eq "yes") {
+
+#
+# Block copy
+#
+add_proto qw/void vp8_copy32xn/, "const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height";
+specialize qw/vp8_copy32xn sse2 sse3/;
+
+#
+# Forward DCT
+#
+add_proto qw/void vp8_short_fdct4x4/, "short *input, short *output, int pitch";
+specialize qw/vp8_short_fdct4x4 sse2 neon msa mmi lsx/;
+
+add_proto qw/void vp8_short_fdct8x4/, "short *input, short *output, int pitch";
+specialize qw/vp8_short_fdct8x4 sse2 neon msa mmi lsx/;
+
+add_proto qw/void vp8_short_walsh4x4/, "short *input, short *output, int pitch";
+specialize qw/vp8_short_walsh4x4 sse2 neon msa mmi/;
+
+#
+# Quantizer
+#
+add_proto qw/void vp8_regular_quantize_b/, "struct block *, struct blockd *";
+specialize qw/vp8_regular_quantize_b sse2 sse4_1 msa mmi lsx/;
+
+add_proto qw/void vp8_fast_quantize_b/, "struct block *, struct blockd *";
+specialize qw/vp8_fast_quantize_b sse2 ssse3 neon msa mmi/;
+
+#
+# Block subtraction
+#
+add_proto qw/int vp8_block_error/, "short *coeff, short *dqcoeff";
+specialize qw/vp8_block_error sse2 msa lsx/;
+
+add_proto qw/int vp8_mbblock_error/, "struct macroblock *mb, int dc";
+specialize qw/vp8_mbblock_error sse2 msa lsx/;
+
+add_proto qw/int vp8_mbuverror/, "struct macroblock *mb";
+specialize qw/vp8_mbuverror sse2 msa/;
+
+#
+# Motion search
+#
+# Both search functions point every listed specialization at a single
+# "sadx4" symbol instead of a per-extension one: vp8_refining_search_sadx4
+# for sse2/msa, and vp8_diamond_search_sadx4 for sse2/msa/lsx.
+add_proto qw/int vp8_refining_search_sad/, "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv";
+specialize qw/vp8_refining_search_sad sse2 msa/;
+$vp8_refining_search_sad_sse2=vp8_refining_search_sadx4;
+$vp8_refining_search_sad_msa=vp8_refining_search_sadx4;
+
+add_proto qw/int vp8_diamond_search_sad/, "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv";
+specialize qw/vp8_diamond_search_sad sse2 msa lsx/;
+$vp8_diamond_search_sad_sse2=vp8_diamond_search_sadx4;
+$vp8_diamond_search_sad_msa=vp8_diamond_search_sadx4;
+$vp8_diamond_search_sad_lsx=vp8_diamond_search_sadx4;
+
+#
+# Alt-ref Noise Reduction (ARNR)
+#
+if (vpx_config("CONFIG_REALTIME_ONLY") ne "yes") {
+ add_proto qw/void vp8_temporal_filter_apply/, "unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count";
+ specialize qw/vp8_temporal_filter_apply sse2 msa/;
+}
+
+#
+# Denoiser filter
+#
+if (vpx_config("CONFIG_TEMPORAL_DENOISING") eq "yes") {
+ add_proto qw/int vp8_denoiser_filter/, "unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising";
+ specialize qw/vp8_denoiser_filter sse2 neon msa/;
+ add_proto qw/int vp8_denoiser_filter_uv/, "unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising";
+ specialize qw/vp8_denoiser_filter_uv sse2 neon msa/;
+}
+
+# End of encoder only functions
+}
+1;
diff --git a/media/libvpx/libvpx/vp8/common/setupintrarecon.c b/media/libvpx/libvpx/vp8/common/setupintrarecon.c
new file mode 100644
index 0000000000..dc8a8aae96
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/setupintrarecon.c
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "setupintrarecon.h"
+#include "vpx_mem/vpx_mem.h"
+
+/* Writes the border values for one plane: width + 5 bytes of 127 in the row
+ * directly above the plane, starting one pixel left of column 0, then 129 in
+ * the pixel immediately left of each of the plane's `height` rows. */
+static void init_plane_intra_border(unsigned char *plane, int stride,
+                                    int width, int height) {
+  int row;
+
+  memset(plane - 1 - stride, 127, width + 5);
+  for (row = 0; row < height; ++row) {
+    plane[stride * row - 1] = (unsigned char)129;
+  }
+}
+
+/* Sets up a new frame for intra coded blocks by initialising the above and
+ * left borders of the Y, U and V planes, in that order. */
+void vp8_setup_intra_recon(YV12_BUFFER_CONFIG *ybf) {
+  init_plane_intra_border(ybf->y_buffer, ybf->y_stride, ybf->y_width,
+                          ybf->y_height);
+  init_plane_intra_border(ybf->u_buffer, ybf->uv_stride, ybf->uv_width,
+                          ybf->uv_height);
+  init_plane_intra_border(ybf->v_buffer, ybf->uv_stride, ybf->uv_width,
+                          ybf->uv_height);
+}
+
+/* Fills only the row directly above each plane with 127. Each fill starts
+ * one pixel left of column 0 and covers the plane width plus 5 bytes. The
+ * left-column values are not touched here. */
+void vp8_setup_intra_recon_top_line(YV12_BUFFER_CONFIG *ybf) {
+  unsigned char *const y_above = ybf->y_buffer - 1 - ybf->y_stride;
+  unsigned char *const u_above = ybf->u_buffer - 1 - ybf->uv_stride;
+  unsigned char *const v_above = ybf->v_buffer - 1 - ybf->uv_stride;
+
+  memset(y_above, 127, ybf->y_width + 5);
+  memset(u_above, 127, ybf->uv_width + 5);
+  memset(v_above, 127, ybf->uv_width + 5);
+}
diff --git a/media/libvpx/libvpx/vp8/common/setupintrarecon.h b/media/libvpx/libvpx/vp8/common/setupintrarecon.h
new file mode 100644
index 0000000000..903a536aed
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/setupintrarecon.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP8_COMMON_SETUPINTRARECON_H_
+#define VPX_VP8_COMMON_SETUPINTRARECON_H_
+
+#include "./vpx_config.h"
+#include "vpx_scale/yv12config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+extern void vp8_setup_intra_recon(YV12_BUFFER_CONFIG *ybf);
+extern void vp8_setup_intra_recon_top_line(YV12_BUFFER_CONFIG *ybf);
+
+/* Writes 129 into the first byte of 16 consecutive rows of y_buffer, then of
+ * 8 consecutive rows of u_buffer, then of 8 consecutive rows of v_buffer.
+ * Rows are y_stride (luma) or uv_stride (chroma) bytes apart. */
+static INLINE void setup_intra_recon_left(unsigned char *y_buffer,
+                                          unsigned char *u_buffer,
+                                          unsigned char *v_buffer, int y_stride,
+                                          int uv_stride) {
+  int row;
+
+  row = 0;
+  while (row < 16) {
+    y_buffer[y_stride * row] = (unsigned char)129;
+    ++row;
+  }
+
+  row = 0;
+  while (row < 8) {
+    u_buffer[uv_stride * row] = (unsigned char)129;
+    ++row;
+  }
+
+  row = 0;
+  while (row < 8) {
+    v_buffer[uv_stride * row] = (unsigned char)129;
+    ++row;
+  }
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP8_COMMON_SETUPINTRARECON_H_
diff --git a/media/libvpx/libvpx/vp8/common/swapyv12buffer.c b/media/libvpx/libvpx/vp8/common/swapyv12buffer.c
new file mode 100644
index 0000000000..5ff21e94a8
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/swapyv12buffer.c
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "swapyv12buffer.h"
+
+void vp8_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame,
+ YV12_BUFFER_CONFIG *last_frame) {
+ unsigned char *temp;
+
+ temp = last_frame->buffer_alloc;
+ last_frame->buffer_alloc = new_frame->buffer_alloc;
+ new_frame->buffer_alloc = temp;
+
+ temp = last_frame->y_buffer;
+ last_frame->y_buffer = new_frame->y_buffer;
+ new_frame->y_buffer = temp;
+
+ temp = last_frame->u_buffer;
+ last_frame->u_buffer = new_frame->u_buffer;
+ new_frame->u_buffer = temp;
+
+ temp = last_frame->v_buffer;
+ last_frame->v_buffer = new_frame->v_buffer;
+ new_frame->v_buffer = temp;
+}
diff --git a/media/libvpx/libvpx/vp8/common/swapyv12buffer.h b/media/libvpx/libvpx/vp8/common/swapyv12buffer.h
new file mode 100644
index 0000000000..e37c471f63
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/swapyv12buffer.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP8_COMMON_SWAPYV12BUFFER_H_
+#define VPX_VP8_COMMON_SWAPYV12BUFFER_H_
+
+#include "vpx_scale/yv12config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp8_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame,
+ YV12_BUFFER_CONFIG *last_frame);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP8_COMMON_SWAPYV12BUFFER_H_
diff --git a/media/libvpx/libvpx/vp8/common/systemdependent.h b/media/libvpx/libvpx/vp8/common/systemdependent.h
new file mode 100644
index 0000000000..83a5513aae
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/systemdependent.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP8_COMMON_SYSTEMDEPENDENT_H_
+#define VPX_VP8_COMMON_SYSTEMDEPENDENT_H_
+
+#include "vpx_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP8Common;
+void vp8_machine_specific_config(struct VP8Common *);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP8_COMMON_SYSTEMDEPENDENT_H_
diff --git a/media/libvpx/libvpx/vp8/common/threading.h b/media/libvpx/libvpx/vp8/common/threading.h
new file mode 100644
index 0000000000..1cfb9fec51
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/threading.h
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP8_COMMON_THREADING_H_
+#define VPX_VP8_COMMON_THREADING_H_
+
+#include "./vpx_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD
+
+/* Thread management macros */
+/* Three variants are selected below: Win32 without pthread.h, OS/2, and
+ * everything else (pthreads). The first two define pthread-style names on
+ * top of the native API; the pthreads variant only adds the THREAD_* and
+ * ts_key_create helpers. */
+#if defined(_WIN32) && !HAVE_PTHREAD_H
+/* Win32 */
+#include <process.h>
+#include <windows.h>
+/* With GCC >= 4.2 the thread entry point also gets
+ * force_align_arg_pointer. */
+#if defined(__GNUC__) && \
+ (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2))
+#define THREAD_FUNCTION \
+ __attribute__((force_align_arg_pointer)) unsigned int __stdcall
+#else
+#define THREAD_FUNCTION unsigned int __stdcall
+#endif
+#define THREAD_FUNCTION_RETURN DWORD
+#define THREAD_SPECIFIC_INDEX DWORD
+#define pthread_t HANDLE
+#define pthread_attr_t DWORD
+/* NOTE(review): expands to a bare `if` statement, so it is not usable as an
+ * expression and can capture a following `else` at the call site. */
+#define pthread_detach(thread) \
+ if (thread != NULL) CloseHandle(thread)
+#define thread_sleep(nms) Sleep(nms)
+/* NOTE(review): terminate_thread is not defined anywhere in this header, and
+ * the Win32 API function is spelled TerminateThread -- confirm whether this
+ * macro is ever expanded. */
+#define pthread_cancel(thread) terminate_thread(thread, 0)
+/* The destructor argument is ignored; the expansion already ends in `;`. */
+#define ts_key_create(ts_key, destructor) \
+ { ts_key = TlsAlloc(); };
+#define pthread_getspecific(ts_key) TlsGetValue(ts_key)
+#define pthread_setspecific(ts_key, value) TlsSetValue(ts_key, (void *)value)
+#define pthread_self() GetCurrentThreadId()
+
+#elif defined(__OS2__)
+/* OS/2 */
+#define INCL_DOS
+#include <os2.h>
+
+#include <stdlib.h>
+#define THREAD_FUNCTION void *
+#define THREAD_FUNCTION_RETURN void *
+#define THREAD_SPECIFIC_INDEX PULONG
+#define pthread_t TID
+#define pthread_attr_t ULONG
+/* Detach is a no-op here: the macro just evaluates to 0. */
+#define pthread_detach(thread) 0
+#define thread_sleep(nms) DosSleep(nms)
+#define pthread_cancel(thread) DosKillThread(thread)
+/* The destructor argument is ignored; the expansion already ends in `;`. */
+#define ts_key_create(ts_key, destructor) \
+ DosAllocThreadLocalMemory(1, &(ts_key));
+/* The key is a PULONG: get/set read and write the ULONG it points to. */
+#define pthread_getspecific(ts_key) ((void *)(*(ts_key)))
+#define pthread_setspecific(ts_key, value) (*(ts_key) = (ULONG)(value))
+#define pthread_self() _gettid()
+#else
+#ifdef __APPLE__
+#include <mach/mach_init.h>
+#include <mach/semaphore.h>
+#include <mach/task.h>
+#include <time.h>
+#include <unistd.h>
+
+#else
+#include <semaphore.h>
+#endif
+
+#include <pthread.h>
+/* pthreads */
+/* Nearly everything is already defined */
+#define THREAD_FUNCTION void *
+#define THREAD_FUNCTION_RETURN void *
+#define THREAD_SPECIFIC_INDEX pthread_key_t
+/* The expansion already ends in `;`. */
+#define ts_key_create(ts_key, destructor) \
+ pthread_key_create(&(ts_key), destructor);
+#endif
+
+/* Synchronization macros: Win32 and Pthreads */
+#if defined(_WIN32) && !HAVE_PTHREAD_H
+/* POSIX-style semaphore names mapped onto Win32 semaphore HANDLEs. */
+#define sem_t HANDLE
+#define pause(voidpara) __asm PAUSE
+/* Evaluates to 0 when CreateSemaphore returns a handle and to 1 when it
+ * returns NULL.
+ * NOTE(review): sem_attr1 and sem_init_value are ignored -- the semaphore is
+ * always created with initial count 0 and maximum count 32768. */
+#define sem_init(sem, sem_attr1, sem_init_value) \
+ (int)((*sem = CreateSemaphore(NULL, 0, 32768, NULL)) == NULL)
+/* Evaluates to 0 when the wait returns WAIT_OBJECT_0, otherwise 1. */
+#define sem_wait(sem) \
+ (int)(WAIT_OBJECT_0 != WaitForSingleObject(*sem, INFINITE))
+/* Evaluates to ReleaseSemaphore's own return value. */
+#define sem_post(sem) ReleaseSemaphore(*sem, 1, NULL)
+/* NOTE(review): expands to a bare `if` statement whose body is a comparison
+ * with no effect beyond calling CloseHandle; it yields no value. */
+#define sem_destroy(sem) \
+ if (*sem) ((int)(CloseHandle(*sem)) == TRUE)
+/* Same expansion as the thread_sleep definition in the thread management
+ * section above. */
+#define thread_sleep(nms) Sleep(nms)
+
+#elif defined(__OS2__)
+/* OS/2 semaphore emulation built from an event semaphore, two mutexes and a
+ * counter. */
+typedef struct {
+ HEV event; /* waited on by sem_wait, posted by sem_post */
+ HMTX wait_mutex; /* held by sem_wait for its whole duration */
+ HMTX count_mutex; /* guards `count` */
+ int count; /* current semaphore count */
+} sem_t;
+
+/* Creates the event semaphore and both mutexes, and sets count to `value`.
+ * The event is created with DC_SEM_SHARED when pshared is non-zero and with
+ * TRUE as its initial state when value > 0. Always returns 0; the return
+ * codes of the Dos* calls are not checked. */
+static inline int sem_init(sem_t *sem, int pshared, unsigned int value) {
+ DosCreateEventSem(NULL, &sem->event, pshared ? DC_SEM_SHARED : 0,
+ value > 0 ? TRUE : FALSE);
+ DosCreateMutexSem(NULL, &sem->wait_mutex, 0, FALSE);
+ DosCreateMutexSem(NULL, &sem->count_mutex, 0, FALSE);
+
+ sem->count = value;
+
+ return 0;
+}
+
+/* Takes wait_mutex, waits on the event, then decrements count under
+ * count_mutex. The event is reset when count reaches 0. Both mutexes are
+ * released before returning. Always returns 0; the Dos* return codes are
+ * not checked.
+ * NOTE(review): the -1 timeout arguments presumably mean "wait
+ * indefinitely" -- confirm against the OS/2 toolkit headers. */
+static inline int sem_wait(sem_t *sem) {
+ DosRequestMutexSem(sem->wait_mutex, -1);
+
+ DosWaitEventSem(sem->event, -1);
+
+ DosRequestMutexSem(sem->count_mutex, -1);
+
+ sem->count--;
+ if (sem->count == 0) {
+ ULONG post_count;
+
+ /* post_count only receives the output of the reset; it is not used. */
+ DosResetEventSem(sem->event, &post_count);
+ }
+
+ DosReleaseMutexSem(sem->count_mutex);
+
+ DosReleaseMutexSem(sem->wait_mutex);
+
+ return 0;
+}
+
+/* Under count_mutex, increments count and posts the event, but only while
+ * count is below 32768; at or above that limit the post is silently
+ * dropped. Always returns 0. */
+static inline int sem_post(sem_t *sem) {
+ DosRequestMutexSem(sem->count_mutex, -1);
+
+ if (sem->count < 32768) {
+ sem->count++;
+ DosPostEventSem(sem->event);
+ }
+
+ DosReleaseMutexSem(sem->count_mutex);
+
+ return 0;
+}
+
+/* Closes the event semaphore and both mutexes. Always returns 0; the Dos*
+ * return codes are not checked. */
+static inline int sem_destroy(sem_t *sem) {
+ DosCloseEventSem(sem->event);
+ DosCloseMutexSem(sem->wait_mutex);
+ DosCloseMutexSem(sem->count_mutex);
+
+ return 0;
+}
+
+#define thread_sleep(nms) DosSleep(nms)
+
+#else
+
+/* On Apple the sem_* names are mapped onto Mach semaphores. Elsewhere no
+ * sem_* macros are defined here, and <semaphore.h> was included in the
+ * thread management section above. */
+#ifdef __APPLE__
+#define sem_t semaphore_t
+/* X is passed straight through as the semaphore pointer, Z is the initial
+ * value, and Y is unused. */
+#define sem_init(X, Y, Z) \
+ semaphore_create(mach_task_self(), X, SYNC_POLICY_FIFO, Z)
+#define sem_wait(sem) (semaphore_wait(*sem))
+#define sem_post(sem) semaphore_signal(*sem)
+#define sem_destroy(sem) semaphore_destroy(mach_task_self(), *sem)
+#else
+#include <unistd.h>
+#include <sched.h>
+#endif /* __APPLE__ */
+/* Not Windows. Assume pthreads */
+
+/* thread_sleep implementation: yield unless Linux/Unix. */
+/* With __unix__ or __APPLE__ defined, thread_sleep expands to nothing (the
+ * nanosleep variant is left commented out). Otherwise it expands to a
+ * sched_yield() call that already carries its own `;`. */
+#if defined(__unix__) || defined(__APPLE__)
+#define thread_sleep(nms)
+/* {struct timespec ts;ts.tv_sec=0;
+ ts.tv_nsec = 1000*nms;nanosleep(&ts, NULL);} */
+#else
+#define thread_sleep(nms) sched_yield();
+#endif /* __unix__ || __APPLE__ */
+
+#endif
+
+#if VPX_ARCH_X86 || VPX_ARCH_X86_64
+#include "vpx_ports/x86.h"
+#else
+#define x86_pause_hint()
+#endif
+
+#include "vpx_util/vpx_thread.h"
+#include "vpx_util/vpx_atomics.h"
+
+/* Spins until mb_col is no longer greater than the acquire-loaded value of
+ * *last_row_current_mb_col minus nsync. Each failed check issues
+ * x86_pause_hint() followed by thread_sleep(0); depending on the platform
+ * macros above, either of these may expand to nothing. */
+static INLINE void vp8_atomic_spin_wait(
+    int mb_col, const vpx_atomic_int *last_row_current_mb_col,
+    const int nsync) {
+  for (;;) {
+    if (!(mb_col >
+          (vpx_atomic_load_acquire(last_row_current_mb_col) - nsync))) {
+      break;
+    }
+    x86_pause_hint();
+    thread_sleep(0);
+  }
+}
+
+#endif /* CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD */
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP8_COMMON_THREADING_H_
diff --git a/media/libvpx/libvpx/vp8/common/treecoder.c b/media/libvpx/libvpx/vp8/common/treecoder.c
new file mode 100644
index 0000000000..f1e78f4321
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/treecoder.c
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "vp8/common/treecoder.h"
+#include "vpx/vpx_integer.h"
+
+/* Recursively walks tree `t` from node index `i` and fills in the token
+ * table `p`.
+ *
+ * v holds the bits accumulated on the path so far and L is their count. On
+ * entry v is doubled (making room for one more bit) and L is incremented.
+ * The loop then visits the two entries of the node pair, t[i] and t[i + 1]:
+ * the first pass runs with the new low bit 0, `++v & 1` turns it into 1 for
+ * the second pass, and the next increment clears it again, ending the loop.
+ *
+ * An entry j <= 0 is a leaf: token -j gets value v and length L. An entry
+ * j > 0 is the index of a child node pair and is descended into. */
+static void tree2tok(struct vp8_token_struct *const p, vp8_tree t, int i, int v,
+ int L) {
+ v += v;
+ ++L;
+
+ do {
+ const vp8_tree_index j = t[i++];
+
+ if (j <= 0) {
+ p[-j].value = v;
+ p[-j].Len = L;
+ } else {
+ tree2tok(p, t, j, v, L);
+ }
+ } while (++v & 1);
+}
+
+/* Fills token table p from tree t, starting at node 0 with an empty bit
+ * path. The leaf with value k is written to p[k]. */
+void vp8_tokens_from_tree(struct vp8_token_struct *p, vp8_tree t) {
+ tree2tok(p, t, 0, 0, 0);
+}
+
+/* Same as vp8_tokens_from_tree, except that the table pointer is moved back
+ * by `offset` first, so the leaf with value k is written to p[k - offset]. */
+void vp8_tokens_from_tree_offset(struct vp8_token_struct *p, vp8_tree t,
+ int offset) {
+ tree2tok(p - offset, t, 0, 0, 0);
+}
+
+/* Converts per-token event counts into per-node branch counts.
+ *
+ * All n - 1 rows of branch_ct are zeroed first. Then, for each of the n
+ * tokens, the token's code (tok[t].value, tok[t].Len bits, most significant
+ * bit first) is replayed through `tree`. At every node visited,
+ * num_events[t] is added to the counter for the branch taken (bit 0 or 1).
+ * The node counter index is the tree index divided by two.
+ *
+ * The asserts check that n >= 2, that the walk stays inside the tree, and
+ * that each code is used up exactly when a leaf is reached. */
+static void branch_counts(int n, /* n = size of alphabet */
+ vp8_token tok[/* n */], vp8_tree tree,
+ unsigned int branch_ct[/* n-1 */][2],
+ const unsigned int num_events[/* n */]) {
+ const int tree_len = n - 1;
+ int t = 0;
+
+ assert(tree_len);
+
+ do {
+ branch_ct[t][0] = branch_ct[t][1] = 0;
+ } while (++t < tree_len);
+
+ t = 0;
+
+ do {
+ int L = tok[t].Len;
+ const int enc = tok[t].value;
+ const unsigned int ct = num_events[t];
+
+ vp8_tree_index i = 0;
+
+ do {
+ /* Next code bit, most significant first. */
+ const int b = (enc >> --L) & 1;
+ /* Tree entries come in pairs, so the node number is i / 2. */
+ const int j = i >> 1;
+ assert(j < tree_len && 0 <= L);
+
+ branch_ct[j][b] += ct;
+ i = tree[i + b];
+ } while (i > 0);
+
+ assert(!L);
+ } while (++t < n);
+}
+
+/* Derives one probability per tree node from token occurrence counts.
+ *
+ * branch_counts() first fills branch_ct with the number of times each
+ * branch of each node was taken. For a node with a non-zero total,
+ * probs[t] is c[0] * Pfactor / (c[0] + c[1]), rounded to nearest when Round
+ * is non-zero and truncated otherwise, then clamped to the range 1..255. A
+ * node that was never visited gets vp8_prob_half.
+ *
+ * The scaled numerator and the division are kept in 64 bits. The previous
+ * expression cast the 64-bit numerator back to unsigned int before
+ * dividing, which discarded the high bits whenever c[0] * Pfactor did not
+ * fit in 32 bits.
+ */
+void vp8_tree_probs_from_distribution(int n, /* n = size of alphabet */
+                                      vp8_token tok[/* n */], vp8_tree tree,
+                                      vp8_prob probs[/* n-1 */],
+                                      unsigned int branch_ct[/* n-1 */][2],
+                                      const unsigned int num_events[/* n */],
+                                      unsigned int Pfactor, int Round) {
+  const int tree_len = n - 1;
+  int t = 0;
+
+  branch_counts(n, tok, tree, branch_ct, num_events);
+
+  do {
+    const unsigned int *const c = branch_ct[t];
+    const unsigned int tot = c[0] + c[1];
+
+    if (tot) {
+      const uint64_t scaled =
+          ((uint64_t)c[0] * Pfactor) + (Round ? tot >> 1 : 0);
+      const uint64_t p = scaled / tot;
+
+      /* agree w/old version for now: clamp into 1..255 */
+      probs[t] = p < 256 ? (p ? (vp8_prob)p : 1) : 255;
+    } else {
+      probs[t] = vp8_prob_half;
+    }
+  } while (++t < tree_len);
+}
diff --git a/media/libvpx/libvpx/vp8/common/treecoder.h b/media/libvpx/libvpx/vp8/common/treecoder.h
new file mode 100644
index 0000000000..d7d8d0ead0
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/treecoder.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP8_COMMON_TREECODER_H_
+#define VPX_VP8_COMMON_TREECODER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef unsigned char vp8bc_index_t; /* probability index */
+
+typedef unsigned char vp8_prob;
+
+#define vp8_prob_half ((vp8_prob)128)
+
+typedef signed char vp8_tree_index;
+struct bool_coder_spec;
+
+typedef struct bool_coder_spec bool_coder_spec;
+typedef struct bool_writer bool_writer;
+typedef struct bool_reader bool_reader;
+
+typedef const bool_coder_spec c_bool_coder_spec;
+typedef const bool_writer c_bool_writer;
+typedef const bool_reader c_bool_reader;
+
+#define vp8_complement(x) (255 - (x))
+
+/* We build coding trees compactly in arrays.
+ Each node of the tree is a pair of vp8_tree_indices.
+ Array index often references a corresponding probability table.
+ Index <= 0 means done encoding/decoding and value = -Index,
+ Index > 0 means need another bit, specification at index.
+ Nonnegative indices are always even; processing begins at node 0. */
+
+typedef const vp8_tree_index vp8_tree[], *vp8_tree_p;
+
+typedef const struct vp8_token_struct {
+ int value;
+ int Len;
+} vp8_token;
+
+/* Construct encoding array from tree. */
+
+void vp8_tokens_from_tree(struct vp8_token_struct *, vp8_tree);
+void vp8_tokens_from_tree_offset(struct vp8_token_struct *, vp8_tree,
+ int offset);
+
+/* Convert array of token occurrence counts into a table of probabilities
+ for the associated binary encoding tree. Also writes count of branches
+ taken for each node on the tree; this facilitates decisions as to
+ probability updates. */
+
+void vp8_tree_probs_from_distribution(int n, /* n = size of alphabet */
+ vp8_token tok[/* n */], vp8_tree tree,
+ vp8_prob probs[/* n-1 */],
+ unsigned int branch_ct[/* n-1 */][2],
+ const unsigned int num_events[/* n */],
+ unsigned int Pfactor, int Round);
+
+/* Variant of above using coder spec rather than hardwired 8-bit probs. */
+
+void vp8bc_tree_probs_from_distribution(int n, /* n = size of alphabet */
+ vp8_token tok[/* n */], vp8_tree tree,
+ vp8_prob probs[/* n-1 */],
+ unsigned int branch_ct[/* n-1 */][2],
+ const unsigned int num_events[/* n */],
+ c_bool_coder_spec *s);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP8_COMMON_TREECODER_H_
diff --git a/media/libvpx/libvpx/vp8/common/vp8_entropymodedata.h b/media/libvpx/libvpx/vp8/common/vp8_entropymodedata.h
new file mode 100644
index 0000000000..3fc942e050
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/vp8_entropymodedata.h
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP8_COMMON_VP8_ENTROPYMODEDATA_H_
+#define VPX_VP8_COMMON_VP8_ENTROPYMODEDATA_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*Generated file, included by entropymode.c*/
+
+const struct vp8_token_struct vp8_bmode_encodings[VP8_BINTRAMODES] = {
+ { 0, 1 }, { 2, 2 }, { 6, 3 }, { 28, 5 }, { 30, 5 },
+ { 58, 6 }, { 59, 6 }, { 62, 6 }, { 126, 7 }, { 127, 7 }
+};
+
+const struct vp8_token_struct vp8_ymode_encodings[VP8_YMODES] = {
+ { 0, 1 }, { 4, 3 }, { 5, 3 }, { 6, 3 }, { 7, 3 }
+};
+
+const struct vp8_token_struct vp8_kf_ymode_encodings[VP8_YMODES] = {
+ { 4, 3 }, { 5, 3 }, { 6, 3 }, { 7, 3 }, { 0, 1 }
+};
+
+const struct vp8_token_struct vp8_uv_mode_encodings[VP8_UV_MODES] = {
+ { 0, 1 }, { 2, 2 }, { 6, 3 }, { 7, 3 }
+};
+
+const struct vp8_token_struct vp8_mbsplit_encodings[VP8_NUMMBSPLITS] = {
+ { 6, 3 }, { 7, 3 }, { 2, 2 }, { 0, 1 }
+};
+
+const struct vp8_token_struct vp8_mv_ref_encoding_array[VP8_MVREFS] = {
+ { 2, 2 }, { 6, 3 }, { 0, 1 }, { 14, 4 }, { 15, 4 }
+};
+
+const struct vp8_token_struct vp8_sub_mv_ref_encoding_array[VP8_SUBMVREFS] = {
+ { 0, 1 }, { 2, 2 }, { 6, 3 }, { 7, 3 }
+};
+
+const struct vp8_token_struct vp8_small_mvencodings[8] = {
+ { 0, 3 }, { 1, 3 }, { 2, 3 }, { 3, 3 }, { 4, 3 }, { 5, 3 }, { 6, 3 }, { 7, 3 }
+};
+
+const vp8_prob vp8_ymode_prob[VP8_YMODES - 1] = { 112, 86, 140, 37 };
+
+const vp8_prob vp8_kf_ymode_prob[VP8_YMODES - 1] = { 145, 156, 163, 128 };
+
+const vp8_prob vp8_uv_mode_prob[VP8_UV_MODES - 1] = { 162, 101, 204 };
+
+const vp8_prob vp8_kf_uv_mode_prob[VP8_UV_MODES - 1] = { 142, 114, 183 };
+
+const vp8_prob vp8_bmode_prob[VP8_BINTRAMODES - 1] = { 120, 90, 79, 133, 87,
+ 85, 80, 111, 151 };
+
+const vp8_prob
+ vp8_kf_bmode_prob[VP8_BINTRAMODES][VP8_BINTRAMODES][VP8_BINTRAMODES - 1] = {
+ { { 231, 120, 48, 89, 115, 113, 120, 152, 112 },
+ { 152, 179, 64, 126, 170, 118, 46, 70, 95 },
+ { 175, 69, 143, 80, 85, 82, 72, 155, 103 },
+ { 56, 58, 10, 171, 218, 189, 17, 13, 152 },
+ { 144, 71, 10, 38, 171, 213, 144, 34, 26 },
+ { 114, 26, 17, 163, 44, 195, 21, 10, 173 },
+ { 121, 24, 80, 195, 26, 62, 44, 64, 85 },
+ { 170, 46, 55, 19, 136, 160, 33, 206, 71 },
+ { 63, 20, 8, 114, 114, 208, 12, 9, 226 },
+ { 81, 40, 11, 96, 182, 84, 29, 16, 36 } },
+ { { 134, 183, 89, 137, 98, 101, 106, 165, 148 },
+ { 72, 187, 100, 130, 157, 111, 32, 75, 80 },
+ { 66, 102, 167, 99, 74, 62, 40, 234, 128 },
+ { 41, 53, 9, 178, 241, 141, 26, 8, 107 },
+ { 104, 79, 12, 27, 217, 255, 87, 17, 7 },
+ { 74, 43, 26, 146, 73, 166, 49, 23, 157 },
+ { 65, 38, 105, 160, 51, 52, 31, 115, 128 },
+ { 87, 68, 71, 44, 114, 51, 15, 186, 23 },
+ { 47, 41, 14, 110, 182, 183, 21, 17, 194 },
+ { 66, 45, 25, 102, 197, 189, 23, 18, 22 } },
+ { { 88, 88, 147, 150, 42, 46, 45, 196, 205 },
+ { 43, 97, 183, 117, 85, 38, 35, 179, 61 },
+ { 39, 53, 200, 87, 26, 21, 43, 232, 171 },
+ { 56, 34, 51, 104, 114, 102, 29, 93, 77 },
+ { 107, 54, 32, 26, 51, 1, 81, 43, 31 },
+ { 39, 28, 85, 171, 58, 165, 90, 98, 64 },
+ { 34, 22, 116, 206, 23, 34, 43, 166, 73 },
+ { 68, 25, 106, 22, 64, 171, 36, 225, 114 },
+ { 34, 19, 21, 102, 132, 188, 16, 76, 124 },
+ { 62, 18, 78, 95, 85, 57, 50, 48, 51 } },
+ { { 193, 101, 35, 159, 215, 111, 89, 46, 111 },
+ { 60, 148, 31, 172, 219, 228, 21, 18, 111 },
+ { 112, 113, 77, 85, 179, 255, 38, 120, 114 },
+ { 40, 42, 1, 196, 245, 209, 10, 25, 109 },
+ { 100, 80, 8, 43, 154, 1, 51, 26, 71 },
+ { 88, 43, 29, 140, 166, 213, 37, 43, 154 },
+ { 61, 63, 30, 155, 67, 45, 68, 1, 209 },
+ { 142, 78, 78, 16, 255, 128, 34, 197, 171 },
+ { 41, 40, 5, 102, 211, 183, 4, 1, 221 },
+ { 51, 50, 17, 168, 209, 192, 23, 25, 82 } },
+ { { 125, 98, 42, 88, 104, 85, 117, 175, 82 },
+ { 95, 84, 53, 89, 128, 100, 113, 101, 45 },
+ { 75, 79, 123, 47, 51, 128, 81, 171, 1 },
+ { 57, 17, 5, 71, 102, 57, 53, 41, 49 },
+ { 115, 21, 2, 10, 102, 255, 166, 23, 6 },
+ { 38, 33, 13, 121, 57, 73, 26, 1, 85 },
+ { 41, 10, 67, 138, 77, 110, 90, 47, 114 },
+ { 101, 29, 16, 10, 85, 128, 101, 196, 26 },
+ { 57, 18, 10, 102, 102, 213, 34, 20, 43 },
+ { 117, 20, 15, 36, 163, 128, 68, 1, 26 } },
+ { { 138, 31, 36, 171, 27, 166, 38, 44, 229 },
+ { 67, 87, 58, 169, 82, 115, 26, 59, 179 },
+ { 63, 59, 90, 180, 59, 166, 93, 73, 154 },
+ { 40, 40, 21, 116, 143, 209, 34, 39, 175 },
+ { 57, 46, 22, 24, 128, 1, 54, 17, 37 },
+ { 47, 15, 16, 183, 34, 223, 49, 45, 183 },
+ { 46, 17, 33, 183, 6, 98, 15, 32, 183 },
+ { 65, 32, 73, 115, 28, 128, 23, 128, 205 },
+ { 40, 3, 9, 115, 51, 192, 18, 6, 223 },
+ { 87, 37, 9, 115, 59, 77, 64, 21, 47 } },
+ { { 104, 55, 44, 218, 9, 54, 53, 130, 226 },
+ { 64, 90, 70, 205, 40, 41, 23, 26, 57 },
+ { 54, 57, 112, 184, 5, 41, 38, 166, 213 },
+ { 30, 34, 26, 133, 152, 116, 10, 32, 134 },
+ { 75, 32, 12, 51, 192, 255, 160, 43, 51 },
+ { 39, 19, 53, 221, 26, 114, 32, 73, 255 },
+ { 31, 9, 65, 234, 2, 15, 1, 118, 73 },
+ { 88, 31, 35, 67, 102, 85, 55, 186, 85 },
+ { 56, 21, 23, 111, 59, 205, 45, 37, 192 },
+ { 55, 38, 70, 124, 73, 102, 1, 34, 98 } },
+ { { 102, 61, 71, 37, 34, 53, 31, 243, 192 },
+ { 69, 60, 71, 38, 73, 119, 28, 222, 37 },
+ { 68, 45, 128, 34, 1, 47, 11, 245, 171 },
+ { 62, 17, 19, 70, 146, 85, 55, 62, 70 },
+ { 75, 15, 9, 9, 64, 255, 184, 119, 16 },
+ { 37, 43, 37, 154, 100, 163, 85, 160, 1 },
+ { 63, 9, 92, 136, 28, 64, 32, 201, 85 },
+ { 86, 6, 28, 5, 64, 255, 25, 248, 1 },
+ { 56, 8, 17, 132, 137, 255, 55, 116, 128 },
+ { 58, 15, 20, 82, 135, 57, 26, 121, 40 } },
+ { { 164, 50, 31, 137, 154, 133, 25, 35, 218 },
+ { 51, 103, 44, 131, 131, 123, 31, 6, 158 },
+ { 86, 40, 64, 135, 148, 224, 45, 183, 128 },
+ { 22, 26, 17, 131, 240, 154, 14, 1, 209 },
+ { 83, 12, 13, 54, 192, 255, 68, 47, 28 },
+ { 45, 16, 21, 91, 64, 222, 7, 1, 197 },
+ { 56, 21, 39, 155, 60, 138, 23, 102, 213 },
+ { 85, 26, 85, 85, 128, 128, 32, 146, 171 },
+ { 18, 11, 7, 63, 144, 171, 4, 4, 246 },
+ { 35, 27, 10, 146, 174, 171, 12, 26, 128 } },
+ { { 190, 80, 35, 99, 180, 80, 126, 54, 45 },
+ { 85, 126, 47, 87, 176, 51, 41, 20, 32 },
+ { 101, 75, 128, 139, 118, 146, 116, 128, 85 },
+ { 56, 41, 15, 176, 236, 85, 37, 9, 62 },
+ { 146, 36, 19, 30, 171, 255, 97, 27, 20 },
+ { 71, 30, 17, 119, 118, 255, 17, 18, 138 },
+ { 101, 38, 60, 138, 55, 70, 43, 26, 142 },
+ { 138, 45, 61, 62, 219, 1, 81, 188, 64 },
+ { 32, 41, 20, 117, 151, 142, 20, 21, 163 },
+ { 112, 19, 12, 61, 195, 128, 48, 4, 24 } }
+ };
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP8_COMMON_VP8_ENTROPYMODEDATA_H_
diff --git a/media/libvpx/libvpx/vp8/common/vp8_loopfilter.c b/media/libvpx/libvpx/vp8/common/vp8_loopfilter.c
new file mode 100644
index 0000000000..9c9e5f351b
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/vp8_loopfilter.c
@@ -0,0 +1,566 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#include "loopfilter.h"
+#include "onyxc_int.h"
+#include "vpx_mem/vpx_mem.h"
+
+static void lf_init_lut(loop_filter_info_n *lfi) {
+ int filt_lvl;
+
+ for (filt_lvl = 0; filt_lvl <= MAX_LOOP_FILTER; ++filt_lvl) {
+ if (filt_lvl >= 40) {
+ lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 2;
+ lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 3;
+ } else if (filt_lvl >= 20) {
+ lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 1;
+ lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 2;
+ } else if (filt_lvl >= 15) {
+ lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 1;
+ lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 1;
+ } else {
+ lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 0;
+ lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 0;
+ }
+ }
+
+ lfi->mode_lf_lut[DC_PRED] = 1;
+ lfi->mode_lf_lut[V_PRED] = 1;
+ lfi->mode_lf_lut[H_PRED] = 1;
+ lfi->mode_lf_lut[TM_PRED] = 1;
+ lfi->mode_lf_lut[B_PRED] = 0;
+
+ lfi->mode_lf_lut[ZEROMV] = 1;
+ lfi->mode_lf_lut[NEARESTMV] = 2;
+ lfi->mode_lf_lut[NEARMV] = 2;
+ lfi->mode_lf_lut[NEWMV] = 2;
+ lfi->mode_lf_lut[SPLITMV] = 3;
+}
+
+void vp8_loop_filter_update_sharpness(loop_filter_info_n *lfi,
+ int sharpness_lvl) {
+ int i;
+
+ /* For each possible value for the loop filter fill out limits */
+ for (i = 0; i <= MAX_LOOP_FILTER; ++i) {
+ int filt_lvl = i;
+ int block_inside_limit = 0;
+
+ /* Set loop filter parameters that control sharpness. */
+ block_inside_limit = filt_lvl >> (sharpness_lvl > 0);
+ block_inside_limit = block_inside_limit >> (sharpness_lvl > 4);
+
+ if (sharpness_lvl > 0) {
+ if (block_inside_limit > (9 - sharpness_lvl)) {
+ block_inside_limit = (9 - sharpness_lvl);
+ }
+ }
+
+ if (block_inside_limit < 1) block_inside_limit = 1;
+
+ memset(lfi->lim[i], block_inside_limit, SIMD_WIDTH);
+ memset(lfi->blim[i], (2 * filt_lvl + block_inside_limit), SIMD_WIDTH);
+ memset(lfi->mblim[i], (2 * (filt_lvl + 2) + block_inside_limit),
+ SIMD_WIDTH);
+ }
+}
+
+void vp8_loop_filter_init(VP8_COMMON *cm) {
+ loop_filter_info_n *lfi = &cm->lf_info;
+ int i;
+
+ /* init limits for given sharpness*/
+ vp8_loop_filter_update_sharpness(lfi, cm->sharpness_level);
+ cm->last_sharpness_level = cm->sharpness_level;
+
+ /* init LUT for lvl and hev thr picking */
+ lf_init_lut(lfi);
+
+ /* init hev threshold const vectors */
+ for (i = 0; i < 4; ++i) {
+ memset(lfi->hev_thr[i], i, SIMD_WIDTH);
+ }
+}
+
+void vp8_loop_filter_frame_init(VP8_COMMON *cm, MACROBLOCKD *mbd,
+ int default_filt_lvl) {
+ int seg, /* segment number */
+ ref, /* index in ref_lf_deltas */
+ mode; /* index in mode_lf_deltas */
+
+ loop_filter_info_n *lfi = &cm->lf_info;
+
+ /* update limits if sharpness has changed */
+ if (cm->last_sharpness_level != cm->sharpness_level) {
+ vp8_loop_filter_update_sharpness(lfi, cm->sharpness_level);
+ cm->last_sharpness_level = cm->sharpness_level;
+ }
+
+ for (seg = 0; seg < MAX_MB_SEGMENTS; ++seg) {
+ int lvl_seg = default_filt_lvl;
+ int lvl_ref, lvl_mode;
+
+ /* Note the baseline filter values for each segment */
+ if (mbd->segmentation_enabled) {
+ if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA) {
+ lvl_seg = mbd->segment_feature_data[MB_LVL_ALT_LF][seg];
+ } else { /* Delta Value */
+ lvl_seg += mbd->segment_feature_data[MB_LVL_ALT_LF][seg];
+ }
+ lvl_seg = (lvl_seg > 0) ? ((lvl_seg > 63) ? 63 : lvl_seg) : 0;
+ }
+
+ if (!mbd->mode_ref_lf_delta_enabled) {
+ /* we could get rid of this if we assume that deltas are set to
+ * zero when not in use; encoder always uses deltas
+ */
+ memset(lfi->lvl[seg][0], lvl_seg, 4 * 4);
+ continue;
+ }
+
+ /* INTRA_FRAME */
+ ref = INTRA_FRAME;
+
+ /* Apply delta for reference frame */
+ lvl_ref = lvl_seg + mbd->ref_lf_deltas[ref];
+
+ /* Apply delta for Intra modes */
+ mode = 0; /* B_PRED */
+ /* Only the split mode BPRED has a further special case */
+ lvl_mode = lvl_ref + mbd->mode_lf_deltas[mode];
+ /* clamp */
+ lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0;
+
+ lfi->lvl[seg][ref][mode] = lvl_mode;
+
+ mode = 1; /* all the rest of Intra modes */
+ /* clamp */
+ lvl_mode = (lvl_ref > 0) ? (lvl_ref > 63 ? 63 : lvl_ref) : 0;
+ lfi->lvl[seg][ref][mode] = lvl_mode;
+
+ /* LAST, GOLDEN, ALT */
+ for (ref = 1; ref < MAX_REF_FRAMES; ++ref) {
+ /* Apply delta for reference frame */
+ lvl_ref = lvl_seg + mbd->ref_lf_deltas[ref];
+
+ /* Apply delta for Inter modes */
+ for (mode = 1; mode < 4; ++mode) {
+ lvl_mode = lvl_ref + mbd->mode_lf_deltas[mode];
+ /* clamp */
+ lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0;
+
+ lfi->lvl[seg][ref][mode] = lvl_mode;
+ }
+ }
+ }
+}
+
+void vp8_loop_filter_row_normal(VP8_COMMON *cm, MODE_INFO *mode_info_context,
+ int mb_row, int post_ystride, int post_uvstride,
+ unsigned char *y_ptr, unsigned char *u_ptr,
+ unsigned char *v_ptr) {
+ int mb_col;
+ int filter_level;
+ loop_filter_info_n *lfi_n = &cm->lf_info;
+ loop_filter_info lfi;
+ FRAME_TYPE frame_type = cm->frame_type;
+
+ for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) {
+ int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
+ mode_info_context->mbmi.mode != SPLITMV &&
+ mode_info_context->mbmi.mb_skip_coeff);
+
+ const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
+ const int seg = mode_info_context->mbmi.segment_id;
+ const int ref_frame = mode_info_context->mbmi.ref_frame;
+
+ filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
+
+ if (filter_level) {
+ const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
+ lfi.mblim = lfi_n->mblim[filter_level];
+ lfi.blim = lfi_n->blim[filter_level];
+ lfi.lim = lfi_n->lim[filter_level];
+ lfi.hev_thr = lfi_n->hev_thr[hev_index];
+
+ if (mb_col > 0)
+ vp8_loop_filter_mbv(y_ptr, u_ptr, v_ptr, post_ystride, post_uvstride,
+ &lfi);
+
+ if (!skip_lf)
+ vp8_loop_filter_bv(y_ptr, u_ptr, v_ptr, post_ystride, post_uvstride,
+ &lfi);
+
+ /* don't apply across umv border */
+ if (mb_row > 0)
+ vp8_loop_filter_mbh(y_ptr, u_ptr, v_ptr, post_ystride, post_uvstride,
+ &lfi);
+
+ if (!skip_lf)
+ vp8_loop_filter_bh(y_ptr, u_ptr, v_ptr, post_ystride, post_uvstride,
+ &lfi);
+ }
+
+ y_ptr += 16;
+ u_ptr += 8;
+ v_ptr += 8;
+
+ mode_info_context++; /* step to next MB */
+ }
+}
+
+void vp8_loop_filter_row_simple(VP8_COMMON *cm, MODE_INFO *mode_info_context,
+ int mb_row, int post_ystride,
+ unsigned char *y_ptr) {
+ int mb_col;
+ int filter_level;
+ loop_filter_info_n *lfi_n = &cm->lf_info;
+
+ for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) {
+ int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
+ mode_info_context->mbmi.mode != SPLITMV &&
+ mode_info_context->mbmi.mb_skip_coeff);
+
+ const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
+ const int seg = mode_info_context->mbmi.segment_id;
+ const int ref_frame = mode_info_context->mbmi.ref_frame;
+
+ filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
+
+ if (filter_level) {
+ if (mb_col > 0)
+ vp8_loop_filter_simple_mbv(y_ptr, post_ystride,
+ lfi_n->mblim[filter_level]);
+
+ if (!skip_lf)
+ vp8_loop_filter_simple_bv(y_ptr, post_ystride,
+ lfi_n->blim[filter_level]);
+
+ /* don't apply across umv border */
+ if (mb_row > 0)
+ vp8_loop_filter_simple_mbh(y_ptr, post_ystride,
+ lfi_n->mblim[filter_level]);
+
+ if (!skip_lf)
+ vp8_loop_filter_simple_bh(y_ptr, post_ystride,
+ lfi_n->blim[filter_level]);
+ }
+
+ y_ptr += 16;
+
+ mode_info_context++; /* step to next MB */
+ }
+}
+void vp8_loop_filter_frame(VP8_COMMON *cm, MACROBLOCKD *mbd, int frame_type) {
+ YV12_BUFFER_CONFIG *post = cm->frame_to_show;
+ loop_filter_info_n *lfi_n = &cm->lf_info;
+ loop_filter_info lfi;
+
+ int mb_row;
+ int mb_col;
+ int mb_rows = cm->mb_rows;
+ int mb_cols = cm->mb_cols;
+
+ int filter_level;
+
+ unsigned char *y_ptr, *u_ptr, *v_ptr;
+
+ /* Point at base of Mb MODE_INFO list */
+ const MODE_INFO *mode_info_context = cm->mi;
+ int post_y_stride = post->y_stride;
+ int post_uv_stride = post->uv_stride;
+
+ /* Initialize the loop filter for this frame. */
+ vp8_loop_filter_frame_init(cm, mbd, cm->filter_level);
+
+ /* Set up the buffer pointers */
+ y_ptr = post->y_buffer;
+ u_ptr = post->u_buffer;
+ v_ptr = post->v_buffer;
+
+ /* Filter each macroblock */
+ if (cm->filter_type == NORMAL_LOOPFILTER) {
+ for (mb_row = 0; mb_row < mb_rows; ++mb_row) {
+ for (mb_col = 0; mb_col < mb_cols; ++mb_col) {
+ int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
+ mode_info_context->mbmi.mode != SPLITMV &&
+ mode_info_context->mbmi.mb_skip_coeff);
+
+ const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
+ const int seg = mode_info_context->mbmi.segment_id;
+ const int ref_frame = mode_info_context->mbmi.ref_frame;
+
+ filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
+
+ if (filter_level) {
+ const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
+ lfi.mblim = lfi_n->mblim[filter_level];
+ lfi.blim = lfi_n->blim[filter_level];
+ lfi.lim = lfi_n->lim[filter_level];
+ lfi.hev_thr = lfi_n->hev_thr[hev_index];
+
+ if (mb_col > 0)
+ vp8_loop_filter_mbv(y_ptr, u_ptr, v_ptr, post_y_stride,
+ post_uv_stride, &lfi);
+
+ if (!skip_lf)
+ vp8_loop_filter_bv(y_ptr, u_ptr, v_ptr, post_y_stride,
+ post_uv_stride, &lfi);
+
+ /* don't apply across umv border */
+ if (mb_row > 0)
+ vp8_loop_filter_mbh(y_ptr, u_ptr, v_ptr, post_y_stride,
+ post_uv_stride, &lfi);
+
+ if (!skip_lf)
+ vp8_loop_filter_bh(y_ptr, u_ptr, v_ptr, post_y_stride,
+ post_uv_stride, &lfi);
+ }
+
+ y_ptr += 16;
+ u_ptr += 8;
+ v_ptr += 8;
+
+ mode_info_context++; /* step to next MB */
+ }
+ y_ptr += post_y_stride * 16 - post->y_width;
+ u_ptr += post_uv_stride * 8 - post->uv_width;
+ v_ptr += post_uv_stride * 8 - post->uv_width;
+
+ mode_info_context++; /* Skip border mb */
+ }
+ } else { /* SIMPLE_LOOPFILTER */
+ for (mb_row = 0; mb_row < mb_rows; ++mb_row) {
+ for (mb_col = 0; mb_col < mb_cols; ++mb_col) {
+ int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
+ mode_info_context->mbmi.mode != SPLITMV &&
+ mode_info_context->mbmi.mb_skip_coeff);
+
+ const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
+ const int seg = mode_info_context->mbmi.segment_id;
+ const int ref_frame = mode_info_context->mbmi.ref_frame;
+
+ filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
+ if (filter_level) {
+ const unsigned char *mblim = lfi_n->mblim[filter_level];
+ const unsigned char *blim = lfi_n->blim[filter_level];
+
+ if (mb_col > 0)
+ vp8_loop_filter_simple_mbv(y_ptr, post_y_stride, mblim);
+
+ if (!skip_lf) vp8_loop_filter_simple_bv(y_ptr, post_y_stride, blim);
+
+ /* don't apply across umv border */
+ if (mb_row > 0)
+ vp8_loop_filter_simple_mbh(y_ptr, post_y_stride, mblim);
+
+ if (!skip_lf) vp8_loop_filter_simple_bh(y_ptr, post_y_stride, blim);
+ }
+
+ y_ptr += 16;
+ u_ptr += 8;
+ v_ptr += 8;
+
+ mode_info_context++; /* step to next MB */
+ }
+ y_ptr += post_y_stride * 16 - post->y_width;
+ u_ptr += post_uv_stride * 8 - post->uv_width;
+ v_ptr += post_uv_stride * 8 - post->uv_width;
+
+ mode_info_context++; /* Skip border mb */
+ }
+ }
+}
+
+void vp8_loop_filter_frame_yonly(VP8_COMMON *cm, MACROBLOCKD *mbd,
+ int default_filt_lvl) {
+ YV12_BUFFER_CONFIG *post = cm->frame_to_show;
+
+ unsigned char *y_ptr;
+ int mb_row;
+ int mb_col;
+
+ loop_filter_info_n *lfi_n = &cm->lf_info;
+ loop_filter_info lfi;
+
+ int filter_level;
+ FRAME_TYPE frame_type = cm->frame_type;
+
+ /* Point at base of Mb MODE_INFO list */
+ const MODE_INFO *mode_info_context = cm->mi;
+
+#if 0
+ if(default_filt_lvl == 0) /* no filter applied */
+ return;
+#endif
+
+ /* Initialize the loop filter for this frame. */
+ vp8_loop_filter_frame_init(cm, mbd, default_filt_lvl);
+
+ /* Set up the buffer pointers */
+ y_ptr = post->y_buffer;
+
+ /* Filter each macroblock */
+ for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) {
+ for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) {
+ int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
+ mode_info_context->mbmi.mode != SPLITMV &&
+ mode_info_context->mbmi.mb_skip_coeff);
+
+ const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
+ const int seg = mode_info_context->mbmi.segment_id;
+ const int ref_frame = mode_info_context->mbmi.ref_frame;
+
+ filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
+
+ if (filter_level) {
+ if (cm->filter_type == NORMAL_LOOPFILTER) {
+ const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
+ lfi.mblim = lfi_n->mblim[filter_level];
+ lfi.blim = lfi_n->blim[filter_level];
+ lfi.lim = lfi_n->lim[filter_level];
+ lfi.hev_thr = lfi_n->hev_thr[hev_index];
+
+ if (mb_col > 0)
+ vp8_loop_filter_mbv(y_ptr, 0, 0, post->y_stride, 0, &lfi);
+
+ if (!skip_lf)
+ vp8_loop_filter_bv(y_ptr, 0, 0, post->y_stride, 0, &lfi);
+
+ /* don't apply across umv border */
+ if (mb_row > 0)
+ vp8_loop_filter_mbh(y_ptr, 0, 0, post->y_stride, 0, &lfi);
+
+ if (!skip_lf)
+ vp8_loop_filter_bh(y_ptr, 0, 0, post->y_stride, 0, &lfi);
+ } else {
+ if (mb_col > 0)
+ vp8_loop_filter_simple_mbv(y_ptr, post->y_stride,
+ lfi_n->mblim[filter_level]);
+
+ if (!skip_lf)
+ vp8_loop_filter_simple_bv(y_ptr, post->y_stride,
+ lfi_n->blim[filter_level]);
+
+ /* don't apply across umv border */
+ if (mb_row > 0)
+ vp8_loop_filter_simple_mbh(y_ptr, post->y_stride,
+ lfi_n->mblim[filter_level]);
+
+ if (!skip_lf)
+ vp8_loop_filter_simple_bh(y_ptr, post->y_stride,
+ lfi_n->blim[filter_level]);
+ }
+ }
+
+ y_ptr += 16;
+ mode_info_context++; /* step to next MB */
+ }
+
+ y_ptr += post->y_stride * 16 - post->y_width;
+ mode_info_context++; /* Skip border mb */
+ }
+}
+
+void vp8_loop_filter_partial_frame(VP8_COMMON *cm, MACROBLOCKD *mbd,
+ int default_filt_lvl) {
+ YV12_BUFFER_CONFIG *post = cm->frame_to_show;
+
+ unsigned char *y_ptr;
+ int mb_row;
+ int mb_col;
+ int mb_cols = post->y_width >> 4;
+ int mb_rows = post->y_height >> 4;
+
+ int linestocopy;
+
+ loop_filter_info_n *lfi_n = &cm->lf_info;
+ loop_filter_info lfi;
+
+ int filter_level;
+ FRAME_TYPE frame_type = cm->frame_type;
+
+ const MODE_INFO *mode_info_context;
+
+#if 0
+ if(default_filt_lvl == 0) /* no filter applied */
+ return;
+#endif
+
+ /* Initialize the loop filter for this frame. */
+ vp8_loop_filter_frame_init(cm, mbd, default_filt_lvl);
+
+ /* number of MB rows to use in partial filtering */
+ linestocopy = mb_rows / PARTIAL_FRAME_FRACTION;
+ linestocopy = linestocopy ? linestocopy << 4 : 16; /* 16 lines per MB */
+
+ /* Set up the buffer pointers; partial image starts at ~middle of frame */
+ y_ptr = post->y_buffer + ((post->y_height >> 5) * 16) * post->y_stride;
+ mode_info_context = cm->mi + (post->y_height >> 5) * (mb_cols + 1);
+
+ /* Filter each macroblock */
+ for (mb_row = 0; mb_row < (linestocopy >> 4); ++mb_row) {
+ for (mb_col = 0; mb_col < mb_cols; ++mb_col) {
+ int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
+ mode_info_context->mbmi.mode != SPLITMV &&
+ mode_info_context->mbmi.mb_skip_coeff);
+
+ const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
+ const int seg = mode_info_context->mbmi.segment_id;
+ const int ref_frame = mode_info_context->mbmi.ref_frame;
+
+ filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
+
+ if (filter_level) {
+ if (cm->filter_type == NORMAL_LOOPFILTER) {
+ const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
+ lfi.mblim = lfi_n->mblim[filter_level];
+ lfi.blim = lfi_n->blim[filter_level];
+ lfi.lim = lfi_n->lim[filter_level];
+ lfi.hev_thr = lfi_n->hev_thr[hev_index];
+
+ if (mb_col > 0)
+ vp8_loop_filter_mbv(y_ptr, 0, 0, post->y_stride, 0, &lfi);
+
+ if (!skip_lf)
+ vp8_loop_filter_bv(y_ptr, 0, 0, post->y_stride, 0, &lfi);
+
+ vp8_loop_filter_mbh(y_ptr, 0, 0, post->y_stride, 0, &lfi);
+
+ if (!skip_lf)
+ vp8_loop_filter_bh(y_ptr, 0, 0, post->y_stride, 0, &lfi);
+ } else {
+ if (mb_col > 0)
+ vp8_loop_filter_simple_mbv(y_ptr, post->y_stride,
+ lfi_n->mblim[filter_level]);
+
+ if (!skip_lf)
+ vp8_loop_filter_simple_bv(y_ptr, post->y_stride,
+ lfi_n->blim[filter_level]);
+
+ vp8_loop_filter_simple_mbh(y_ptr, post->y_stride,
+ lfi_n->mblim[filter_level]);
+
+ if (!skip_lf)
+ vp8_loop_filter_simple_bh(y_ptr, post->y_stride,
+ lfi_n->blim[filter_level]);
+ }
+ }
+
+ y_ptr += 16;
+ mode_info_context += 1; /* step to next MB */
+ }
+
+ y_ptr += post->y_stride * 16 - post->y_width;
+ mode_info_context += 1; /* Skip border mb */
+ }
+}
diff --git a/media/libvpx/libvpx/vp8/common/vp8_skin_detection.c b/media/libvpx/libvpx/vp8/common/vp8_skin_detection.c
new file mode 100644
index 0000000000..6739efa5fe
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/vp8_skin_detection.c
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp8/common/alloccommon.h"
+#include "vp8/common/vp8_skin_detection.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_util/vpx_write_yuv_frame.h"
+
+static int avg_2x2(const uint8_t *s, int p) {
+ int i, j;
+ int sum = 0;
+ for (i = 0; i < 2; ++i, s += p) {
+ for (j = 0; j < 2; ++j) {
+ sum += s[j];
+ }
+ }
+ return (sum + 2) >> 2;
+}
+
+int vp8_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v,
+ int stride, int strideuv,
+ SKIN_DETECTION_BLOCK_SIZE bsize, int consec_zeromv,
+ int curr_motion_magn) {
+ // No skin if block has had zero/small motion for a long consecutive time.
+ if (consec_zeromv > 60 && curr_motion_magn == 0) {
+ return 0;
+ } else {
+ int motion = 1;
+ if (consec_zeromv > 25 && curr_motion_magn == 0) motion = 0;
+ if (bsize == SKIN_16X16) {
+ // Take the average of center 2x2 pixels.
+ const int ysource = avg_2x2(y + 7 * stride + 7, stride);
+ const int usource = avg_2x2(u + 3 * strideuv + 3, strideuv);
+ const int vsource = avg_2x2(v + 3 * strideuv + 3, strideuv);
+ return vpx_skin_pixel(ysource, usource, vsource, motion);
+ } else {
+ int num_skin = 0;
+ int i, j;
+ for (i = 0; i < 2; i++) {
+ for (j = 0; j < 2; j++) {
+ // Take the average of center 2x2 pixels.
+ const int ysource = avg_2x2(y + 3 * stride + 3, stride);
+ const int usource = avg_2x2(u + strideuv + 1, strideuv);
+ const int vsource = avg_2x2(v + strideuv + 1, strideuv);
+ num_skin += vpx_skin_pixel(ysource, usource, vsource, motion);
+ if (num_skin >= 2) return 1;
+ y += 8;
+ u += 4;
+ v += 4;
+ }
+ y += (stride << 3) - 16;
+ u += (strideuv << 2) - 8;
+ v += (strideuv << 2) - 8;
+ }
+
+ return 0;
+ }
+ }
+}
+
+#ifdef OUTPUT_YUV_SKINMAP
+// For viewing skin map on input source.
+void vp8_compute_skin_map(VP8_COMP *const cpi, FILE *yuv_skinmap_file) {
+ int i, j, mb_row, mb_col, num_bl;
+ VP8_COMMON *const cm = &cpi->common;
+ uint8_t *y;
+ const uint8_t *src_y = cpi->Source->y_buffer;
+ const int src_ystride = cpi->Source->y_stride;
+ int offset = 0;
+
+ YV12_BUFFER_CONFIG skinmap;
+ memset(&skinmap, 0, sizeof(skinmap));
+ if (vp8_yv12_alloc_frame_buffer(&skinmap, cm->Width, cm->Height,
+ VP8BORDERINPIXELS) < 0) {
+ vpx_free_frame_buffer(&skinmap);
+ return;
+ }
+ memset(skinmap.buffer_alloc, 128, skinmap.frame_size);
+ y = skinmap.y_buffer;
+ // Loop through blocks and set skin map based on center pixel of block.
+ // Set y to white for skin block, otherwise set to source with gray scale.
+ for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 1) {
+ num_bl = 0;
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col += 1) {
+ const int is_skin = cpi->skin_map[offset++];
+ for (i = 0; i < 16; i++) {
+ for (j = 0; j < 16; j++) {
+ y[i * src_ystride + j] = is_skin ? 255 : src_y[i * src_ystride + j];
+ }
+ }
+ num_bl++;
+ y += 16;
+ src_y += 16;
+ }
+ y += (src_ystride << 4) - (num_bl << 4);
+ src_y += (src_ystride << 4) - (num_bl << 4);
+ }
+ vpx_write_yuv_frame(yuv_skinmap_file, &skinmap);
+ vpx_free_frame_buffer(&skinmap);
+}
+#endif // OUTPUT_YUV_SKINMAP
diff --git a/media/libvpx/libvpx/vp8/common/vp8_skin_detection.h b/media/libvpx/libvpx/vp8/common/vp8_skin_detection.h
new file mode 100644
index 0000000000..ef0e4ae4fe
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/vp8_skin_detection.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP8_COMMON_VP8_SKIN_DETECTION_H_
+#define VPX_VP8_COMMON_VP8_SKIN_DETECTION_H_
+
+#include "vp8/encoder/onyx_int.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/skin_detection.h"
+#include "vpx_scale/yv12config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP8_COMP;
+
+typedef enum {
+ // Skin detection based on 8x8 block. If two of them are identified as skin,
+ // the macroblock is marked as skin.
+ SKIN_8X8,
+ // Skin detection based on 16x16 block.
+ SKIN_16X16
+} SKIN_DETECTION_BLOCK_SIZE;
+
+int vp8_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v,
+ int stride, int strideuv,
+ SKIN_DETECTION_BLOCK_SIZE bsize, int consec_zeromv,
+ int curr_motion_magn);
+
+#ifdef OUTPUT_YUV_SKINMAP
+// For viewing skin map on input source.
+void vp8_compute_skin_map(struct VP8_COMP *const cpi, FILE *yuv_skinmap_file);
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP8_COMMON_VP8_SKIN_DETECTION_H_
diff --git a/media/libvpx/libvpx/vp8/common/x86/bilinear_filter_sse2.c b/media/libvpx/libvpx/vp8/common/x86/bilinear_filter_sse2.c
new file mode 100644
index 0000000000..ff6cbbd68c
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/x86/bilinear_filter_sse2.c
@@ -0,0 +1,336 @@
+/*
+ * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <xmmintrin.h>
+
+#include "./vp8_rtcd.h"
+#include "./vpx_config.h"
+#include "vp8/common/filter.h"
+#include "vpx_dsp/x86/mem_sse2.h"
+#include "vpx_ports/mem.h"
+
+// Horizontal pass of the 16x16 bilinear predictor. Reads 17 rows of pixels
+// from src (rows are stride bytes apart) and writes 17 rows of 16 16-bit
+// values to dst using aligned 16-byte stores. With xoffset == 0 the pixels
+// are only widened; otherwise each output is the rounded two-tap blend of a
+// pixel and its right-hand neighbour, so 17 bytes of every row are read.
+static INLINE void horizontal_16x16(uint8_t *src, const int stride,
+                                    uint16_t *dst, const int xoffset) {
+  const __m128i zeros = _mm_setzero_si128();
+  int rows_left;
+
+  if (xoffset == 0) {
+    // No filtering required: zero-extend each byte to 16 bits.
+    for (rows_left = 17; rows_left > 0; --rows_left) {
+      const __m128i pixels = _mm_loadu_si128((__m128i *)src);
+      _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi8(pixels, zeros));
+      _mm_store_si128((__m128i *)(dst + 8), _mm_unpackhi_epi8(pixels, zeros));
+      src += stride;
+      dst += 16;
+    }
+    return;
+  }
+
+  {
+    const __m128i rounding = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1));
+    const __m128i tap_0 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][0]);
+    const __m128i tap_1 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][1]);
+
+    for (rows_left = 17; rows_left > 0; --rows_left) {
+      const __m128i cur = _mm_loadu_si128((__m128i *)src);
+      const __m128i next = _mm_loadu_si128((__m128i *)(src + 1));
+
+      // Widen the inputs of both taps, weight them and accumulate.
+      __m128i lo =
+          _mm_add_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8(cur, zeros), tap_0),
+                        _mm_mullo_epi16(_mm_unpacklo_epi8(next, zeros), tap_1));
+      __m128i hi =
+          _mm_add_epi16(_mm_mullo_epi16(_mm_unpackhi_epi8(cur, zeros), tap_0),
+                        _mm_mullo_epi16(_mm_unpackhi_epi8(next, zeros), tap_1));
+
+      // Round, then scale back down by the filter precision.
+      lo = _mm_srai_epi16(_mm_add_epi16(lo, rounding), VP8_FILTER_SHIFT);
+      hi = _mm_srai_epi16(_mm_add_epi16(hi, rounding), VP8_FILTER_SHIFT);
+
+      _mm_store_si128((__m128i *)dst, lo);
+      _mm_store_si128((__m128i *)(dst + 8), hi);
+      src += stride;
+      dst += 16;
+    }
+  }
+}
+
+// Vertical pass of the 16x16 bilinear predictor. src holds rows of 16 16-bit
+// values (read with aligned loads); 16 rows of 16 pixels are written to dst
+// with aligned 16-byte stores, rows being stride bytes apart. With
+// yoffset == 0 the first 16 rows are simply packed back to bytes; otherwise
+// each output row is the rounded two-tap blend of a row and the row below
+// it, so 17 input rows are consumed.
+static INLINE void vertical_16x16(uint16_t *src, uint8_t *dst, const int stride,
+                                  const int yoffset) {
+  int row;
+
+  if (yoffset == 0) {
+    for (row = 0; row < 16; ++row) {
+      const uint16_t *const in = src + 16 * row;
+      const __m128i lo = _mm_load_si128((const __m128i *)in);
+      const __m128i hi = _mm_load_si128((const __m128i *)(in + 8));
+      _mm_store_si128((__m128i *)dst, _mm_packus_epi16(lo, hi));
+      dst += stride;
+    }
+    return;
+  }
+
+  {
+    const __m128i rounding = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1));
+    const __m128i tap_0 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][0]);
+    const __m128i tap_1 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][1]);
+
+    // The upper row is carried from one iteration to the next, so every
+    // input row is loaded exactly once.
+    __m128i above_lo = _mm_load_si128((__m128i *)src);
+    __m128i above_hi = _mm_load_si128((__m128i *)(src + 8));
+
+    for (row = 1; row <= 16; ++row) {
+      const uint16_t *const below = src + 16 * row;
+      const __m128i below_lo = _mm_load_si128((const __m128i *)below);
+      const __m128i below_hi = _mm_load_si128((const __m128i *)(below + 8));
+
+      const __m128i blend_lo = _mm_add_epi16(_mm_mullo_epi16(above_lo, tap_0),
+                                             _mm_mullo_epi16(below_lo, tap_1));
+      const __m128i blend_hi = _mm_add_epi16(_mm_mullo_epi16(above_hi, tap_0),
+                                             _mm_mullo_epi16(below_hi, tap_1));
+
+      const __m128i out_lo =
+          _mm_srai_epi16(_mm_add_epi16(blend_lo, rounding), VP8_FILTER_SHIFT);
+      const __m128i out_hi =
+          _mm_srai_epi16(_mm_add_epi16(blend_hi, rounding), VP8_FILTER_SHIFT);
+
+      _mm_store_si128((__m128i *)dst, _mm_packus_epi16(out_lo, out_hi));
+      above_lo = below_lo;
+      above_hi = below_hi;
+      dst += stride;
+    }
+  }
+}
+
+// 16x16 bilinear prediction in two passes: the horizontal pass fills a
+// 17-row scratch buffer of 16-bit values, the vertical pass turns it into the
+// 16 output rows at dst_ptr. The assert rejects calls in which both offsets
+// are zero.
+void vp8_bilinear_predict16x16_sse2(uint8_t *src_ptr, int src_pixels_per_line,
+                                    int xoffset, int yoffset, uint8_t *dst_ptr,
+                                    int dst_pitch) {
+  DECLARE_ALIGNED(16, uint16_t, first_pass[16 * 17]);
+
+  assert((xoffset | yoffset) != 0);
+
+  horizontal_16x16(src_ptr, src_pixels_per_line, first_pass, xoffset);
+  vertical_16x16(first_pass, dst_ptr, dst_pitch, yoffset);
+}
+
+// Horizontal pass for 8-pixel-wide blocks. Converts height rows of src (rows
+// are stride bytes apart) into rows of 8 16-bit values stored to dst with
+// aligned 16-byte stores. Callers pass one row more than the block height so
+// that the vertical pass has the context it needs.
+static INLINE void horizontal_8xN(uint8_t *src, const int stride, uint16_t *dst,
+                                  const int xoffset, const int height) {
+  const __m128i zeros = _mm_setzero_si128();
+  int row;
+
+  if (xoffset == 0) {
+    // No filtering required: widen 8 pixels per row.
+    for (row = 0; row < height; ++row) {
+      const __m128i pixels = _mm_loadl_epi64((__m128i *)src);
+      _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi8(pixels, zeros));
+      src += stride;
+      dst += 8;
+    }
+    return;
+  }
+
+  {
+    const __m128i rounding = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1));
+    const __m128i tap_0 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][0]);
+    const __m128i tap_1 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][1]);
+
+    // A single 16-byte load (an over-read past the 9 bytes actually needed)
+    // supplies both taps: shifting it right by one byte lines up each
+    // pixel's right-hand neighbour.
+    for (row = 0; row < height; ++row) {
+      const __m128i cur = _mm_loadu_si128((__m128i *)src);
+      const __m128i next = _mm_srli_si128(cur, 1);
+      const __m128i blend =
+          _mm_add_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8(cur, zeros), tap_0),
+                        _mm_mullo_epi16(_mm_unpacklo_epi8(next, zeros), tap_1));
+      const __m128i out =
+          _mm_srai_epi16(_mm_add_epi16(blend, rounding), VP8_FILTER_SHIFT);
+      _mm_store_si128((__m128i *)dst, out);
+      src += stride;
+      dst += 8;
+    }
+  }
+}
+
+// Vertical pass for 8-pixel-wide blocks. src holds rows of 8 16-bit values
+// (read with aligned loads); height rows of 8 pixels are written to dst, rows
+// being stride bytes apart. With yoffset == 0 the rows are only packed back
+// to bytes; otherwise each output row blends a row with the row below it, so
+// height + 1 input rows are read.
+static INLINE void vertical_8xN(uint16_t *src, uint8_t *dst, const int stride,
+                                const int yoffset, const int height) {
+  int row;
+
+  if (yoffset == 0) {
+    for (row = 0; row < height; ++row) {
+      const __m128i wide = _mm_load_si128((__m128i *)(src + 8 * row));
+      _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(wide, wide));
+      dst += stride;
+    }
+    return;
+  }
+
+  {
+    const __m128i rounding = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1));
+    const __m128i tap_0 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][0]);
+    const __m128i tap_1 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][1]);
+
+    // The upper row is carried between iterations.
+    __m128i above = _mm_load_si128((__m128i *)src);
+
+    for (row = 1; row <= height; ++row) {
+      const __m128i below = _mm_load_si128((__m128i *)(src + 8 * row));
+      const __m128i blend = _mm_add_epi16(_mm_mullo_epi16(above, tap_0),
+                                          _mm_mullo_epi16(below, tap_1));
+      const __m128i out =
+          _mm_srai_epi16(_mm_add_epi16(blend, rounding), VP8_FILTER_SHIFT);
+      _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(out, out));
+      above = below;
+      dst += stride;
+    }
+  }
+}
+
+// 8x8 bilinear prediction: the horizontal pass filters 9 rows into a 16-bit
+// scratch buffer, the vertical pass produces the 8 output rows at dst_ptr.
+// The assert rejects calls in which both offsets are zero.
+void vp8_bilinear_predict8x8_sse2(uint8_t *src_ptr, int src_pixels_per_line,
+                                  int xoffset, int yoffset, uint8_t *dst_ptr,
+                                  int dst_pitch) {
+  DECLARE_ALIGNED(16, uint16_t, first_pass[8 * 9]);
+
+  assert((xoffset | yoffset) != 0);
+
+  horizontal_8xN(src_ptr, src_pixels_per_line, first_pass, xoffset, 9);
+  vertical_8xN(first_pass, dst_ptr, dst_pitch, yoffset, 8);
+}
+
+// 8x4 bilinear prediction: the horizontal pass filters 5 rows into a 16-bit
+// scratch buffer, the vertical pass produces the 4 output rows at dst_ptr.
+// The assert rejects calls in which both offsets are zero.
+void vp8_bilinear_predict8x4_sse2(uint8_t *src_ptr, int src_pixels_per_line,
+                                  int xoffset, int yoffset, uint8_t *dst_ptr,
+                                  int dst_pitch) {
+  DECLARE_ALIGNED(16, uint16_t, first_pass[8 * 5]);
+
+  assert((xoffset | yoffset) != 0);
+
+  horizontal_8xN(src_ptr, src_pixels_per_line, first_pass, xoffset, 5);
+  vertical_8xN(first_pass, dst_ptr, dst_pitch, yoffset, 4);
+}
+
+// Horizontal pass of the 4x4 bilinear predictor. Converts 5 rows of src (rows
+// are stride bytes apart) into 5 rows of 4 16-bit values written back to back
+// in dst. With xoffset == 0 the pixels are only widened; otherwise each
+// output is the rounded two-tap blend of a pixel and its right-hand
+// neighbour, so 5 bytes of every row are read.
+static INLINE void horizontal_4x4(uint8_t *src, const int stride, uint16_t *dst,
+                                  const int xoffset) {
+  const __m128i zeros = _mm_setzero_si128();
+  int rows_left;
+
+  if (xoffset == 0) {
+    for (rows_left = 5; rows_left > 0; --rows_left) {
+      const __m128i pixels = load_unaligned_u32(src);
+      _mm_storel_epi64((__m128i *)dst, _mm_unpacklo_epi8(pixels, zeros));
+      src += stride;
+      dst += 4;
+    }
+    return;
+  }
+
+  {
+    const __m128i rounding = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1));
+    const __m128i tap_0 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][0]);
+    const __m128i tap_1 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][1]);
+
+    for (rows_left = 5; rows_left > 0; --rows_left) {
+      const __m128i cur = load_unaligned_u32(src);
+      const __m128i next = load_unaligned_u32(src + 1);
+      const __m128i blend =
+          _mm_add_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8(cur, zeros), tap_0),
+                        _mm_mullo_epi16(_mm_unpacklo_epi8(next, zeros), tap_1));
+      const __m128i out =
+          _mm_srai_epi16(_mm_add_epi16(blend, rounding), VP8_FILTER_SHIFT);
+      _mm_storel_epi64((__m128i *)dst, out);
+      src += stride;
+      dst += 4;
+    }
+  }
+}
+
+// Vertical pass of the 4x4 bilinear predictor. src holds rows of 4 16-bit
+// values stored back to back, so one 16-byte load covers two rows; the loops
+// therefore emit two output rows per iteration. With yoffset == 0 rows 0..3
+// are packed straight to bytes; otherwise each output row blends a row with
+// the row below it, and the "below" vector is the same data loaded 4 values
+// further on (an unaligned load).
+static INLINE void vertical_4x4(uint16_t *src, uint8_t *dst, const int stride,
+                                const int yoffset) {
+  int pair;
+
+  if (yoffset == 0) {
+    for (pair = 0; pair < 2; ++pair) {
+      const __m128i two_rows = _mm_load_si128((__m128i *)(src + 8 * pair));
+      __m128i bytes = _mm_packus_epi16(two_rows, two_rows);
+      store_unaligned_u32(dst, bytes);
+      dst += stride;
+      // Bring the second row's 4 bytes down to the low dword.
+      bytes = _mm_srli_si128(bytes, 4);
+      store_unaligned_u32(dst, bytes);
+      dst += stride;
+    }
+    return;
+  }
+
+  {
+    const __m128i rounding = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1));
+    const __m128i tap_0 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][0]);
+    const __m128i tap_1 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][1]);
+
+    for (pair = 0; pair < 2; ++pair) {
+      const uint16_t *const in = src + 8 * pair;
+      const __m128i upper = _mm_load_si128((const __m128i *)in);
+      const __m128i lower = _mm_loadu_si128((const __m128i *)(in + 4));
+      const __m128i blend = _mm_add_epi16(_mm_mullo_epi16(upper, tap_0),
+                                          _mm_mullo_epi16(lower, tap_1));
+      const __m128i out =
+          _mm_srai_epi16(_mm_add_epi16(blend, rounding), VP8_FILTER_SHIFT);
+      __m128i bytes = _mm_packus_epi16(out, out);
+      storeu_int32(dst, _mm_cvtsi128_si32(bytes));
+      dst += stride;
+      // Bring the second row's 4 bytes down to the low dword.
+      bytes = _mm_srli_si128(bytes, 4);
+      storeu_int32(dst, _mm_cvtsi128_si32(bytes));
+      dst += stride;
+    }
+  }
+}
+
+// 4x4 bilinear prediction: the horizontal pass filters 5 rows into a 16-bit
+// scratch buffer, the vertical pass produces the 4 output rows at dst_ptr.
+// The assert rejects calls in which both offsets are zero.
+void vp8_bilinear_predict4x4_sse2(uint8_t *src_ptr, int src_pixels_per_line,
+                                  int xoffset, int yoffset, uint8_t *dst_ptr,
+                                  int dst_pitch) {
+  DECLARE_ALIGNED(16, uint16_t, first_pass[4 * 5]);
+
+  assert((xoffset | yoffset) != 0);
+
+  horizontal_4x4(src_ptr, src_pixels_per_line, first_pass, xoffset);
+  vertical_4x4(first_pass, dst_ptr, dst_pitch, yoffset);
+}
diff --git a/media/libvpx/libvpx/vp8/common/x86/dequantize_mmx.asm b/media/libvpx/libvpx/vp8/common/x86/dequantize_mmx.asm
new file mode 100644
index 0000000000..0a269e15f7
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/x86/dequantize_mmx.asm
@@ -0,0 +1,259 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+SECTION .text
+
+;void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q) -- dq[i] = sq[i] * q[i] for i = 0..15
+globalsym(vp8_dequantize_b_impl_mmx)
+sym(vp8_dequantize_b_impl_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 3
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;sq
+ mov rdi, arg(1) ;dq
+ mov rax, arg(2) ;q
+
+ movq mm1, [rsi] ; sq[0..3]
+ pmullw mm1, [rax+0] ; mm1 = sq[0..3] * q[0..3] (low 16 bits of each product)
+ movq [rdi], mm1 ; dq[0..3]
+
+ movq mm1, [rsi+8] ; sq[4..7]
+ pmullw mm1, [rax+8] ; mm1 = sq[4..7] * q[4..7]
+ movq [rdi+8], mm1 ; dq[4..7]
+
+ movq mm1, [rsi+16] ; sq[8..11]
+ pmullw mm1, [rax+16] ; mm1 = sq[8..11] * q[8..11]
+ movq [rdi+16], mm1 ; dq[8..11]
+
+ movq mm1, [rsi+24] ; sq[12..15]
+ pmullw mm1, [rax+24] ; mm1 = sq[12..15] * q[12..15]
+ movq [rdi+24], mm1 ; dq[12..15]
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_dequant_idct_add_mmx(
+;short *input, 0 -- 16 coefficients; zeroed by this routine after they are loaded
+;short *dq, 1 -- 16 factors multiplied element-wise into input
+;unsigned char *dest, 2 -- 4x4 pixels; read, summed with the IDCT output and written back
+;int stride) 3 -- byte distance between rows of dest
+globalsym(vp8_dequant_idct_add_mmx)
+sym(vp8_dequant_idct_add_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ GET_GOT rbx
+ push rdi
+ ; end prolog
+
+ mov rax, arg(0) ;input
+ mov rdx, arg(1) ;dq
+
+
+ movq mm0, [rax ] ; input[0..3]
+ pmullw mm0, [rdx] ; ... * dq[0..3]
+
+ movq mm1, [rax +8] ; input[4..7]
+ pmullw mm1, [rdx +8] ; ... * dq[4..7]
+
+ movq mm2, [rax+16] ; input[8..11]
+ pmullw mm2, [rdx+16] ; ... * dq[8..11]
+
+ movq mm3, [rax+24] ; input[12..15]
+ pmullw mm3, [rdx+24] ; ... * dq[12..15]
+
+ mov rdx, arg(2) ;dest
+
+ pxor mm7, mm7 ; zero, used to clear the input block
+
+
+ movq [rax], mm7
+ movq [rax+8], mm7
+
+ movq [rax+16],mm7
+ movq [rax+24],mm7
+
+
+ movsxd rdi, dword ptr arg(3) ;stride
+
+ psubw mm0, mm2 ; b1= 0-2
+ paddw mm2, mm2 ; 2 * row 2, so that adding b1 below yields row 0 + row 2
+
+ movq mm5, mm1
+ paddw mm2, mm0 ; a1 =0+2
+
+ pmulhw mm5, [GLOBAL(x_s1sqr2)];
+ paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2)
+
+ movq mm7, mm3 ;
+ pmulhw mm7, [GLOBAL(x_c1sqr2less1)];
+
+ paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw mm7, mm5 ; c1
+
+ movq mm5, mm1
+ movq mm4, mm3
+
+ pmulhw mm5, [GLOBAL(x_c1sqr2less1)]
+ paddw mm5, mm1 ; ip1 * cos(pi/8) * sqrt(2)
+
+ pmulhw mm3, [GLOBAL(x_s1sqr2)]
+ paddw mm3, mm4 ; ip3 * sin(pi/8) * sqrt(2)
+
+ paddw mm3, mm5 ; d1
+ movq mm6, mm2 ; a1
+
+ movq mm4, mm0 ; b1
+ paddw mm2, mm3 ;0
+
+ paddw mm4, mm7 ;1
+ psubw mm0, mm7 ;2
+
+ psubw mm6, mm3 ;3
+
+ movq mm1, mm2 ; 03 02 01 00
+ movq mm3, mm4 ; 23 22 21 20
+
+ punpcklwd mm1, mm0 ; 11 01 10 00
+ punpckhwd mm2, mm0 ; 13 03 12 02
+
+ punpcklwd mm3, mm6 ; 31 21 30 20
+ punpckhwd mm4, mm6 ; 33 23 32 22
+
+ movq mm0, mm1 ; 11 01 10 00
+ movq mm5, mm2 ; 13 03 12 02
+
+ punpckldq mm0, mm3 ; 30 20 10 00
+ punpckhdq mm1, mm3 ; 31 21 11 01
+
+ punpckldq mm2, mm4 ; 32 22 12 02
+ punpckhdq mm5, mm4 ; 33 23 13 03
+
+ movq mm3, mm5 ; 33 23 13 03
+
+ psubw mm0, mm2 ; b1= 0-2
+ paddw mm2, mm2 ; 2 * row 2, so that adding b1 below yields row 0 + row 2
+
+ movq mm5, mm1
+ paddw mm2, mm0 ; a1 =0+2
+
+ pmulhw mm5, [GLOBAL(x_s1sqr2)];
+ paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2)
+
+ movq mm7, mm3 ;
+ pmulhw mm7, [GLOBAL(x_c1sqr2less1)];
+
+ paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw mm7, mm5 ; c1
+
+ movq mm5, mm1
+ movq mm4, mm3
+
+ pmulhw mm5, [GLOBAL(x_c1sqr2less1)]
+ paddw mm5, mm1 ; ip1 * cos(pi/8) * sqrt(2)
+
+ pmulhw mm3, [GLOBAL(x_s1sqr2)]
+ paddw mm3, mm4 ; ip3 * sin(pi/8) * sqrt(2)
+
+ paddw mm3, mm5 ; d1
+ paddw mm0, [GLOBAL(fours)] ; b1 + 4: rounding term for the >> 3 below
+
+ paddw mm2, [GLOBAL(fours)] ; a1 + 4: rounding term for the >> 3 below
+ movq mm6, mm2 ; a1
+
+ movq mm4, mm0 ; b1
+ paddw mm2, mm3 ;0
+
+ paddw mm4, mm7 ;1
+ psubw mm0, mm7 ;2
+
+ psubw mm6, mm3 ;3
+ psraw mm2, 3 ; arithmetic >> 3 on each of the four result vectors
+
+ psraw mm0, 3
+ psraw mm4, 3
+
+ psraw mm6, 3
+
+ movq mm1, mm2 ; 03 02 01 00
+ movq mm3, mm4 ; 23 22 21 20
+
+ punpcklwd mm1, mm0 ; 11 01 10 00
+ punpckhwd mm2, mm0 ; 13 03 12 02
+
+ punpcklwd mm3, mm6 ; 31 21 30 20
+ punpckhwd mm4, mm6 ; 33 23 32 22
+
+ movq mm0, mm1 ; 11 01 10 00
+ movq mm5, mm2 ; 13 03 12 02
+
+ punpckldq mm0, mm3 ; 30 20 10 00
+ punpckhdq mm1, mm3 ; 31 21 11 01
+
+ punpckldq mm2, mm4 ; 32 22 12 02
+ punpckhdq mm5, mm4 ; 33 23 13 03
+
+ pxor mm7, mm7 ; zero for the byte <-> word conversions below
+
+ movd mm4, [rdx] ; 4 pixels of dest row 0
+ punpcklbw mm4, mm7 ; widen to words
+ paddsw mm0, mm4 ; add the IDCT output (signed saturation)
+ packuswb mm0, mm7 ; back to bytes, clamped to 0..255
+ movd [rdx], mm0
+
+ movd mm4, [rdx+rdi] ; dest row 1
+ punpcklbw mm4, mm7
+ paddsw mm1, mm4
+ packuswb mm1, mm7
+ movd [rdx+rdi], mm1
+
+ movd mm4, [rdx+2*rdi] ; dest row 2
+ punpcklbw mm4, mm7
+ paddsw mm2, mm4
+ packuswb mm2, mm7
+ movd [rdx+rdi*2], mm2
+
+ add rdx, rdi ; step one row so that [rdx+2*rdi] addresses row 3
+
+ movd mm4, [rdx+2*rdi] ; dest row 3
+ punpcklbw mm4, mm7
+ paddsw mm5, mm4
+ packuswb mm5, mm7
+ movd [rdx+rdi*2], mm5
+
+ ; begin epilog
+ pop rdi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+x_s1sqr2:
+ times 4 dw 0x8A8C ; 35468 ~= sqrt(2) * sin(pi/8) * 65536; negative as a signed word, so users add the input back after pmulhw
+align 16
+x_c1sqr2less1:
+ times 4 dw 0x4E7B ; 20091 ~= (sqrt(2) * cos(pi/8) - 1) * 65536; users add the input back after pmulhw
+align 16
+fours:
+ times 4 dw 0x0004 ; rounding term added before the final arithmetic shift right by 3
diff --git a/media/libvpx/libvpx/vp8/common/x86/idct_blk_mmx.c b/media/libvpx/libvpx/vp8/common/x86/idct_blk_mmx.c
new file mode 100644
index 0000000000..fd804b1ca4
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/x86/idct_blk_mmx.c
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#include "vp8/common/blockd.h"
+#include "vpx_mem/vpx_mem.h"
+
+extern void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q);
+
+// Dequantizes block d with the MMX helper: d->qcoeff is the input, DQC holds
+// the per-coefficient factors and the products are written to d->dqcoeff.
+void vp8_dequantize_b_mmx(BLOCKD *d, short *DQC) {
+  vp8_dequantize_b_impl_mmx((short *)d->qcoeff, (short *)d->dqcoeff, DQC);
+}
diff --git a/media/libvpx/libvpx/vp8/common/x86/idct_blk_sse2.c b/media/libvpx/libvpx/vp8/common/x86/idct_blk_sse2.c
new file mode 100644
index 0000000000..897ed5b652
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/x86/idct_blk_sse2.c
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+
+void vp8_idct_dequant_0_2x_sse2(short *q, short *dq, unsigned char *dst,
+ int dst_stride);
+void vp8_idct_dequant_full_2x_sse2(short *q, short *dq, unsigned char *dst,
+ int dst_stride);
+
+// Dequantizes, inverse-transforms and adds the 16 4x4 luma blocks of a
+// macroblock into dst. Blocks are processed as horizontally adjacent pairs:
+// every helper call consumes 32 coefficients from q and covers an 8x4 pixel
+// area, so each of the four iterations handles one row of four blocks.
+//
+// eobs holds one byte per 4x4 block. For each pair:
+//   - both bytes zero            -> nothing is done,
+//   - both bytes <= 1 (not both 0) -> the DC-only path is used,
+//   - either byte > 1            -> the full IDCT is used.
+//
+// The two bytes of a pair are combined with a bitwise OR instead of being
+// read through a short pointer: the result of every test is the same, but
+// the char array is no longer accessed through an incompatible (and possibly
+// misaligned) lvalue type.
+void vp8_dequant_idct_add_y_block_sse2(short *q, short *dq, unsigned char *dst,
+                                       int stride, char *eobs) {
+  int i;
+
+  for (i = 0; i < 4; ++i) {
+    // Left pair of the row: blocks 0 and 1.
+    const int left = eobs[0] | eobs[1];
+    // Right pair of the row: blocks 2 and 3.
+    const int right = eobs[2] | eobs[3];
+
+    // Bits 1..7 set in either byte means that block has more than one
+    // coefficient.
+    if (left & 0xfe) {
+      vp8_idct_dequant_full_2x_sse2(q, dq, dst, stride);
+    } else if (left) {
+      vp8_idct_dequant_0_2x_sse2(q, dq, dst, stride);
+    }
+
+    if (right & 0xfe) {
+      vp8_idct_dequant_full_2x_sse2(q + 32, dq, dst + 8, stride);
+    } else if (right) {
+      vp8_idct_dequant_0_2x_sse2(q + 32, dq, dst + 8, stride);
+    }
+
+    q += 64;
+    dst += stride * 4;
+    eobs += 4;
+  }
+}
+
+// Dequantizes, inverse-transforms and adds the chroma blocks of a macroblock.
+// Blocks are processed as horizontally adjacent pairs, each helper call
+// consuming 32 coefficients from q and covering an 8x4 pixel area: pairs 0
+// and 1 go to the top and bottom halves of dst_u, pairs 2 and 3 to the top
+// and bottom halves of dst_v.
+//
+// eobs holds one byte per 4x4 block. For each pair:
+//   - both bytes zero            -> nothing is done,
+//   - both bytes <= 1 (not both 0) -> the DC-only path is used,
+//   - either byte > 1            -> the full IDCT is used.
+//
+// The two bytes of a pair are combined with a bitwise OR instead of being
+// read through a short pointer: the result of every test is the same, but
+// the char array is no longer accessed through an incompatible (and possibly
+// misaligned) lvalue type.
+void vp8_dequant_idct_add_uv_block_sse2(short *q, short *dq,
+                                        unsigned char *dst_u,
+                                        unsigned char *dst_v, int stride,
+                                        char *eobs) {
+  int pair;
+
+  for (pair = 0; pair < 4; ++pair) {
+    // Even pairs are the top four rows of a plane, odd pairs the bottom four.
+    unsigned char *const dst =
+        ((pair < 2) ? dst_u : dst_v) + (pair & 1) * stride * 4;
+    const int pair_eobs = eobs[2 * pair] | eobs[2 * pair + 1];
+
+    // Bits 1..7 set in either byte means that block has more than one
+    // coefficient.
+    if (pair_eobs & 0xfe) {
+      vp8_idct_dequant_full_2x_sse2(q, dq, dst, stride);
+    } else if (pair_eobs) {
+      vp8_idct_dequant_0_2x_sse2(q, dq, dst, stride);
+    }
+
+    q += 32;
+  }
+}
diff --git a/media/libvpx/libvpx/vp8/common/x86/idctllm_mmx.asm b/media/libvpx/libvpx/vp8/common/x86/idctllm_mmx.asm
new file mode 100644
index 0000000000..6cea86fe03
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/x86/idctllm_mmx.asm
@@ -0,0 +1,296 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+; /****************************************************************************
+; * Notes:
+; *
+; * This implementation makes use of 16 bit fixed point version of two multiply
+; * constants:
+; * 1. sqrt(2) * cos (pi/8)
+; * 2. sqrt(2) * sin (pi/8)
+; * Because the first constant is bigger than 1, to maintain the same 16 bit
+; * fixed point precision as the second one, we use a trick of
+; * x * a = x + x*(a-1)
+; * so
+; * x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
+; *
+; * For the second constant, the 16 bit version is 35468, which is bigger
+; * than 32768, so in a signed 16 bit multiply it becomes a negative
+; * number. The result is corrected by adding x back:
+; * (x * (unsigned)35468 >> 16) = x * (signed)35468 >> 16 + x
+; *
+; **************************************************************************/
+
+SECTION .text
+
+;void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred,
+;int pitch, unsigned char *dest,int stride) -- dest = clamp(pred + 4x4 inverse DCT of input); pitch and stride are the row steps of pred and dest
+globalsym(vp8_short_idct4x4llm_mmx)
+sym(vp8_short_idct4x4llm_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rax, arg(0) ;input
+ mov rsi, arg(1) ;pred
+
+ movq mm0, [rax ] ; input[0..3]
+ movq mm1, [rax+ 8] ; input[4..7]
+ movq mm2, [rax+16] ; input[8..11]
+ movq mm3, [rax+24] ; input[12..15]
+
+%if 0
+ pxor mm7, mm7 ; disabled code: would zero the input block after it is loaded
+ movq [rax], mm7
+ movq [rax+8], mm7
+ movq [rax+16],mm7
+ movq [rax+24],mm7
+%endif
+ movsxd rax, dword ptr arg(2) ;pitch
+ mov rdx, arg(3) ;dest
+ movsxd rdi, dword ptr arg(4) ;stride
+
+
+ psubw mm0, mm2 ; b1= 0-2
+ paddw mm2, mm2 ; 2 * row 2, so that adding b1 below yields row 0 + row 2
+
+ movq mm5, mm1
+ paddw mm2, mm0 ; a1 =0+2
+
+ pmulhw mm5, [GLOBAL(x_s1sqr2)];
+ paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2)
+
+ movq mm7, mm3 ;
+ pmulhw mm7, [GLOBAL(x_c1sqr2less1)];
+
+ paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw mm7, mm5 ; c1
+
+ movq mm5, mm1
+ movq mm4, mm3
+
+ pmulhw mm5, [GLOBAL(x_c1sqr2less1)]
+ paddw mm5, mm1 ; ip1 * cos(pi/8) * sqrt(2)
+
+ pmulhw mm3, [GLOBAL(x_s1sqr2)]
+ paddw mm3, mm4 ; ip3 * sin(pi/8) * sqrt(2)
+
+ paddw mm3, mm5 ; d1
+ movq mm6, mm2 ; a1
+
+ movq mm4, mm0 ; b1
+ paddw mm2, mm3 ;0
+
+ paddw mm4, mm7 ;1
+ psubw mm0, mm7 ;2
+
+ psubw mm6, mm3 ;3
+
+ movq mm1, mm2 ; 03 02 01 00
+ movq mm3, mm4 ; 23 22 21 20
+
+ punpcklwd mm1, mm0 ; 11 01 10 00
+ punpckhwd mm2, mm0 ; 13 03 12 02
+
+ punpcklwd mm3, mm6 ; 31 21 30 20
+ punpckhwd mm4, mm6 ; 33 23 32 22
+
+ movq mm0, mm1 ; 11 01 10 00
+ movq mm5, mm2 ; 13 03 12 02
+
+ punpckldq mm0, mm3 ; 30 20 10 00
+ punpckhdq mm1, mm3 ; 31 21 11 01
+
+ punpckldq mm2, mm4 ; 32 22 12 02
+ punpckhdq mm5, mm4 ; 33 23 13 03
+
+ movq mm3, mm5 ; 33 23 13 03
+
+ psubw mm0, mm2 ; b1= 0-2
+ paddw mm2, mm2 ; 2 * row 2, so that adding b1 below yields row 0 + row 2
+
+ movq mm5, mm1
+ paddw mm2, mm0 ; a1 =0+2
+
+ pmulhw mm5, [GLOBAL(x_s1sqr2)];
+ paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2)
+
+ movq mm7, mm3 ;
+ pmulhw mm7, [GLOBAL(x_c1sqr2less1)];
+
+ paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw mm7, mm5 ; c1
+
+ movq mm5, mm1
+ movq mm4, mm3
+
+ pmulhw mm5, [GLOBAL(x_c1sqr2less1)]
+ paddw mm5, mm1 ; ip1 * cos(pi/8) * sqrt(2)
+
+ pmulhw mm3, [GLOBAL(x_s1sqr2)]
+ paddw mm3, mm4 ; ip3 * sin(pi/8) * sqrt(2)
+
+ paddw mm3, mm5 ; d1
+ paddw mm0, [GLOBAL(fours)] ; b1 + 4: rounding term for the >> 3 below
+
+ paddw mm2, [GLOBAL(fours)] ; a1 + 4: rounding term for the >> 3 below
+ movq mm6, mm2 ; a1
+
+ movq mm4, mm0 ; b1
+ paddw mm2, mm3 ;0
+
+ paddw mm4, mm7 ;1
+ psubw mm0, mm7 ;2
+
+ psubw mm6, mm3 ;3
+ psraw mm2, 3 ; arithmetic >> 3 on each of the four result vectors
+
+ psraw mm0, 3
+ psraw mm4, 3
+
+ psraw mm6, 3
+
+ movq mm1, mm2 ; 03 02 01 00
+ movq mm3, mm4 ; 23 22 21 20
+
+ punpcklwd mm1, mm0 ; 11 01 10 00
+ punpckhwd mm2, mm0 ; 13 03 12 02
+
+ punpcklwd mm3, mm6 ; 31 21 30 20
+ punpckhwd mm4, mm6 ; 33 23 32 22
+
+ movq mm0, mm1 ; 11 01 10 00
+ movq mm5, mm2 ; 13 03 12 02
+
+ punpckldq mm0, mm3 ; 30 20 10 00
+ punpckhdq mm1, mm3 ; 31 21 11 01
+
+ punpckldq mm2, mm4 ; 32 22 12 02
+ punpckhdq mm5, mm4 ; 33 23 13 03
+
+ pxor mm7, mm7 ; zero for the byte <-> word conversions below
+
+ movd mm4, [rsi] ; 4 pixels of pred row 0
+ punpcklbw mm4, mm7 ; widen to words
+ paddsw mm0, mm4 ; add the IDCT output (signed saturation)
+ packuswb mm0, mm7 ; back to bytes, clamped to 0..255
+ movd [rdx], mm0 ; dest row 0
+
+ movd mm4, [rsi+rax] ; pred row 1
+ punpcklbw mm4, mm7
+ paddsw mm1, mm4
+ packuswb mm1, mm7
+ movd [rdx+rdi], mm1 ; dest row 1
+
+ movd mm4, [rsi+2*rax] ; pred row 2
+ punpcklbw mm4, mm7
+ paddsw mm2, mm4
+ packuswb mm2, mm7
+ movd [rdx+rdi*2], mm2 ; dest row 2
+
+ add rdx, rdi ; step dest one row so that [rdx+rdi*2] addresses row 3
+ add rsi, rax ; step pred one row so that [rsi+2*rax] addresses row 3
+
+ movd mm4, [rsi+2*rax] ; pred row 3
+ punpcklbw mm4, mm7
+ paddsw mm5, mm4
+ packuswb mm5, mm7
+ movd [rdx+rdi*2], mm5 ; dest row 3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_dc_only_idct_add_mmx( -- adds (input_dc + 4) >> 3 to a 4x4 block of pred and stores the clamped result to dst
+;short input_dc,
+;unsigned char *pred_ptr,
+;int pred_stride,
+;unsigned char *dst_ptr,
+;int stride)
+globalsym(vp8_dc_only_idct_add_mmx)
+sym(vp8_dc_only_idct_add_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ ; end prolog
+
+ movd mm5, arg(0) ;input_dc
+ mov rax, arg(1) ;pred_ptr
+ movsxd rdx, dword ptr arg(2) ;pred_stride
+
+ pxor mm0, mm0 ; zero for the byte <-> word conversions below
+
+ paddw mm5, [GLOBAL(fours)] ; + 4: rounding term for the >> 3 below
+ lea rcx, [rdx + rdx*2] ; rcx = 3 * pred_stride
+
+ psraw mm5, 3
+
+ punpcklwd mm5, mm5 ; duplicate the low word into words 0 and 1
+
+ punpckldq mm5, mm5 ; ... and then into all four words
+
+ movd mm1, [rax] ; pred row 0
+ movd mm2, [rax+rdx] ; pred row 1
+ movd mm3, [rax+2*rdx] ; pred row 2
+ movd mm4, [rax+rcx] ; pred row 3
+
+ mov rax, arg(3) ;d -- destination
+ movsxd rdx, dword ptr arg(4) ;dst_stride
+
+ punpcklbw mm1, mm0 ; widen the 4 pixels to words
+ paddsw mm1, mm5 ; add the DC term (signed saturation)
+ packuswb mm1, mm0 ; pack back to bytes with unsigned saturation (clamps to 0..255)
+ lea rcx, [rdx + rdx*2] ; rcx = 3 * dst_stride
+
+ punpcklbw mm2, mm0
+ paddsw mm2, mm5
+ packuswb mm2, mm0 ; pack back to bytes with unsigned saturation
+
+ punpcklbw mm3, mm0
+ paddsw mm3, mm5
+ packuswb mm3, mm0 ; pack back to bytes with unsigned saturation
+
+ punpcklbw mm4, mm0
+ paddsw mm4, mm5
+ packuswb mm4, mm0 ; pack back to bytes with unsigned saturation
+
+ movd [rax], mm1 ; dst row 0
+ movd [rax+rdx], mm2 ; dst row 1
+ movd [rax+2*rdx], mm3 ; dst row 2
+ movd [rax+rcx], mm4 ; dst row 3
+
+ ; begin epilog
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+x_s1sqr2:
+ times 4 dw 0x8A8C ; 35468 ~= sqrt(2) * sin(pi/8) * 65536; negative as a signed word, so users add the input back after pmulhw
+align 16
+x_c1sqr2less1:
+ times 4 dw 0x4E7B ; 20091 ~= (sqrt(2) * cos(pi/8) - 1) * 65536; users add the input back after pmulhw
+align 16
+fours:
+ times 4 dw 0x0004 ; rounding term added before the final arithmetic shift right by 3
diff --git a/media/libvpx/libvpx/vp8/common/x86/idctllm_sse2.asm b/media/libvpx/libvpx/vp8/common/x86/idctllm_sse2.asm
new file mode 100644
index 0000000000..bb79d2da3b
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/x86/idctllm_sse2.asm
@@ -0,0 +1,710 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp8_idct_dequant_0_2x_sse2 -- DC-only dequant + IDCT + add for two horizontally adjacent 4x4 blocks
+; (
+; short *qcoeff - 0 (block 0 at byte offset 0, block 1 at byte offset 32; only the first coefficient of each is used)
+; short *dequant - 1 (only dequant[0] takes part in the result)
+; unsigned char *dst - 2 (8 pixels x 4 rows, updated in place)
+; int dst_stride - 3
+; )
+
+SECTION .text
+
+globalsym(vp8_idct_dequant_0_2x_sse2)
+sym(vp8_idct_dequant_0_2x_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ GET_GOT rbx
+ ; end prolog
+
+ mov rdx, arg(1) ; dequant
+ mov rax, arg(0) ; qcoeff
+
+ movd xmm4, [rax] ; word 0 = DC coefficient of block 0
+ movd xmm5, [rdx] ; word 0 = dequant[0]
+
+ pinsrw xmm4, [rax+32], 4 ; word 4 = DC coefficient of block 1
+ pinsrw xmm5, [rdx], 4 ; word 4 = dequant[0]
+
+ pmullw xmm4, xmm5 ; words 0 and 4 = dequantized DC of blocks 0 and 1
+
+ ; Zero out xmm5, for use unpacking
+ pxor xmm5, xmm5
+
+ ; clear coeffs (the first 4 bytes of each block)
+ movd [rax], xmm5
+ movd [rax+32], xmm5
+; (the pshuflw/pshufhw pair below could be a single pshufb, but that needs SSSE3)
+ mov rax, arg(2) ; dst
+ movsxd rdx, dword ptr arg(3) ; dst_stride
+
+ pshuflw xmm4, xmm4, 00000000b ; broadcast block 0's DC over words 0..3
+ pshufhw xmm4, xmm4, 00000000b ; broadcast block 1's DC over words 4..7
+
+ lea rcx, [rdx + rdx*2] ; rcx = 3 * dst_stride
+ paddw xmm4, [GLOBAL(fours)]
+
+ psraw xmm4, 3 ; per word: (DC + 4) >> 3, assuming fours holds 4 in every word (defined outside this view)
+
+ movq xmm0, [rax] ; dst row 0, 8 pixels
+ movq xmm1, [rax+rdx] ; dst row 1
+ movq xmm2, [rax+2*rdx] ; dst row 2
+ movq xmm3, [rax+rcx] ; dst row 3
+
+ punpcklbw xmm0, xmm5 ; widen the pixels to words
+ punpcklbw xmm1, xmm5
+ punpcklbw xmm2, xmm5
+ punpcklbw xmm3, xmm5
+
+
+ ; Add to predict buffer
+ paddw xmm0, xmm4
+ paddw xmm1, xmm4
+ paddw xmm2, xmm4
+ paddw xmm3, xmm4
+
+ ; pack up before storing (unsigned saturation clamps to 0..255)
+ packuswb xmm0, xmm5
+ packuswb xmm1, xmm5
+ packuswb xmm2, xmm5
+ packuswb xmm3, xmm5
+
+ ; store blocks back out
+ movq [rax], xmm0
+ movq [rax + rdx], xmm1
+
+ lea rax, [rax + 2*rdx] ; advance to row 2
+
+ movq [rax], xmm2
+ movq [rax + rdx], xmm3
+
+ ; begin epilog
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_idct_dequant_full_2x_sse2
+; (
+; short *qcoeff - 0
+; short *dequant - 1
+; unsigned char *dst - 2
+; int dst_stride - 3
+; )
+globalsym(vp8_idct_dequant_full_2x_sse2)
+sym(vp8_idct_dequant_full_2x_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ; special case when 2 blocks have 0 or 1 coeffs
+ ; dc is set as first coeff, so no need to load qcoeff
+ mov rax, arg(0) ; qcoeff
+ mov rdx, arg(1) ; dequant
+ mov rdi, arg(2) ; dst
+
+
+ ; Zero out xmm7, for use unpacking
+ pxor xmm7, xmm7
+
+
+ ; note the transpose of xmm1 and xmm2, necessary for shuffle
+ ; to spit out sensible data
+ movdqa xmm0, [rax]
+ movdqa xmm2, [rax+16]
+ movdqa xmm1, [rax+32]
+ movdqa xmm3, [rax+48]
+
+ ; Clear out coeffs
+ movdqa [rax], xmm7
+ movdqa [rax+16], xmm7
+ movdqa [rax+32], xmm7
+ movdqa [rax+48], xmm7
+
+ ; dequantize qcoeff buffer
+ pmullw xmm0, [rdx]
+ pmullw xmm2, [rdx+16]
+ pmullw xmm1, [rdx]
+ pmullw xmm3, [rdx+16]
+ movsxd rdx, dword ptr arg(3) ; dst_stride
+
+ ; repack so block 0 row x and block 1 row x are together
+ movdqa xmm4, xmm0
+ punpckldq xmm0, xmm1
+ punpckhdq xmm4, xmm1
+
+ pshufd xmm0, xmm0, 11011000b
+ pshufd xmm1, xmm4, 11011000b
+
+ movdqa xmm4, xmm2
+ punpckldq xmm2, xmm3
+ punpckhdq xmm4, xmm3
+
+ pshufd xmm2, xmm2, 11011000b
+ pshufd xmm3, xmm4, 11011000b
+
+ ; first pass
+ psubw xmm0, xmm2 ; b1 = 0-2
+ paddw xmm2, xmm2 ;
+
+ movdqa xmm5, xmm1
+ paddw xmm2, xmm0 ; a1 = 0+2
+
+ pmulhw xmm5, [GLOBAL(x_s1sqr2)]
+ lea rcx, [rdx + rdx*2] ;dst_stride * 3
+ paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
+
+ movdqa xmm7, xmm3
+ pmulhw xmm7, [GLOBAL(x_c1sqr2less1)]
+
+ paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw xmm7, xmm5 ; c1
+
+ movdqa xmm5, xmm1
+ movdqa xmm4, xmm3
+
+ pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
+ paddw xmm5, xmm1
+
+ pmulhw xmm3, [GLOBAL(x_s1sqr2)]
+ paddw xmm3, xmm4
+
+ paddw xmm3, xmm5 ; d1
+ movdqa xmm6, xmm2 ; a1
+
+ movdqa xmm4, xmm0 ; b1
+ paddw xmm2, xmm3 ;0
+
+ paddw xmm4, xmm7 ;1
+ psubw xmm0, xmm7 ;2
+
+ psubw xmm6, xmm3 ;3
+
+ ; transpose for the second pass
+ movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000
+ punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000
+ punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100
+
+ movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008
+ punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008
+ punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108
+
+
+ movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000
+ punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000
+ punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002
+
+ movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100
+ punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100
+ punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102
+
+
+ movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000
+ punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000
+ punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001
+
+ movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002
+ punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002
+ punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003
+
+ pshufd xmm0, xmm2, 11011000b
+ pshufd xmm2, xmm1, 11011000b
+
+ pshufd xmm1, xmm5, 11011000b
+ pshufd xmm3, xmm7, 11011000b
+
+ ; second pass
+ psubw xmm0, xmm2 ; b1 = 0-2
+ paddw xmm2, xmm2
+
+ movdqa xmm5, xmm1
+ paddw xmm2, xmm0 ; a1 = 0+2
+
+ pmulhw xmm5, [GLOBAL(x_s1sqr2)]
+ paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
+
+ movdqa xmm7, xmm3
+ pmulhw xmm7, [GLOBAL(x_c1sqr2less1)]
+
+ paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw xmm7, xmm5 ; c1
+
+ movdqa xmm5, xmm1
+ movdqa xmm4, xmm3
+
+ pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
+ paddw xmm5, xmm1
+
+ pmulhw xmm3, [GLOBAL(x_s1sqr2)]
+ paddw xmm3, xmm4
+
+ paddw xmm3, xmm5 ; d1
+ paddw xmm0, [GLOBAL(fours)]
+
+ paddw xmm2, [GLOBAL(fours)]
+ movdqa xmm6, xmm2 ; a1
+
+ movdqa xmm4, xmm0 ; b1
+ paddw xmm2, xmm3 ;0
+
+ paddw xmm4, xmm7 ;1
+ psubw xmm0, xmm7 ;2
+
+ psubw xmm6, xmm3 ;3
+ psraw xmm2, 3
+
+ psraw xmm0, 3
+ psraw xmm4, 3
+
+ psraw xmm6, 3
+
+ ; transpose to save
+ movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000
+ punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000
+ punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100
+
+ movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008
+ punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008
+ punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108
+
+
+ movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000
+ punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000
+ punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002
+
+ movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100
+ punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100
+ punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102
+
+
+ movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000
+ punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000
+ punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001
+
+ movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002
+ punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002
+ punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003
+
+ pshufd xmm0, xmm2, 11011000b
+ pshufd xmm2, xmm1, 11011000b
+
+ pshufd xmm1, xmm5, 11011000b
+ pshufd xmm3, xmm7, 11011000b
+
+ pxor xmm7, xmm7
+
+ ; Load up predict blocks
+ movq xmm4, [rdi]
+ movq xmm5, [rdi+rdx]
+
+ punpcklbw xmm4, xmm7
+ punpcklbw xmm5, xmm7
+
+ paddw xmm0, xmm4
+ paddw xmm1, xmm5
+
+ movq xmm4, [rdi+2*rdx]
+ movq xmm5, [rdi+rcx]
+
+ punpcklbw xmm4, xmm7
+ punpcklbw xmm5, xmm7
+
+ paddw xmm2, xmm4
+ paddw xmm3, xmm5
+
+.finish:
+
+ ; pack up before storing
+ packuswb xmm0, xmm7
+ packuswb xmm1, xmm7
+ packuswb xmm2, xmm7
+ packuswb xmm3, xmm7
+
+ ; store blocks back out
+ movq [rdi], xmm0
+ movq [rdi + rdx], xmm1
+ movq [rdi + rdx*2], xmm2
+ movq [rdi + rcx], xmm3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_idct_dequant_dc_0_2x_sse2
+; (
+; short *qcoeff - 0 (pointer is fetched but never dereferenced here)
+; short *dequant - 1 (not referenced by this routine)
+; unsigned char *dst - 2 (4 rows of 8 bytes: read, adjusted, written back)
+; int dst_stride - 3
+; short *dc - 4 (two 16-bit values, one per 4-pixel-wide half)
+; )
+globalsym(vp8_idct_dequant_dc_0_2x_sse2)
+sym(vp8_idct_dequant_dc_0_2x_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ push rdi
+ ; end prolog
+ ; Adds (dc[i] + 4) >> 3 to 4 rows of dst: dc[0] to bytes 0-3, dc[1] to bytes 4-7.
+ ; special case when 2 blocks have 0 or 1 coeffs
+ ; dc is set as first coeff, so no need to load qcoeff
+ mov rax, arg(0) ; qcoeff (rax is not used again below)
+
+ mov rdi, arg(2) ; dst
+ mov rdx, arg(4) ; dc
+
+ ; Zero out xmm5, for use unpacking
+ pxor xmm5, xmm5
+
+ ; load up 2 dc words here == 2*16 = doubleword
+ movd xmm4, [rdx]
+
+ movsxd rdx, dword ptr arg(3) ; dst_stride (rdx no longer holds the dc pointer)
+ lea rcx, [rdx + rdx*2] ; rcx = 3 * dst_stride
+ ; Load up predict blocks
+ movq xmm0, [rdi] ; row 0, 8 bytes
+ movq xmm1, [rdi+rdx*1] ; row 1
+ movq xmm2, [rdi+rdx*2] ; row 2
+ movq xmm3, [rdi+rcx] ; row 3
+
+ ; Duplicate and expand dc across
+ punpcklwd xmm4, xmm4 ; low words: dc0 dc0 dc1 dc1
+ punpckldq xmm4, xmm4 ; words: dc0 x4, then dc1 x4
+
+ ; Round and downshift: (dc + 4) >> 3, arithmetic shift per word
+ paddw xmm4, [GLOBAL(fours)]
+ psraw xmm4, 3
+
+ ; Predict buffer needs to be expanded from bytes to words
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm1, xmm5
+ punpcklbw xmm2, xmm5
+ punpcklbw xmm3, xmm5
+
+ ; Add to predict buffer
+ paddw xmm0, xmm4
+ paddw xmm1, xmm4
+ paddw xmm2, xmm4
+ paddw xmm3, xmm4
+
+ ; pack up before storing (words saturate to unsigned bytes; xmm5 is zero)
+ packuswb xmm0, xmm5
+ packuswb xmm1, xmm5
+ packuswb xmm2, xmm5
+ packuswb xmm3, xmm5
+
+ ; store blocks back out (8 bytes per row)
+ movq [rdi], xmm0
+ movq [rdi + rdx], xmm1
+ movq [rdi + rdx*2], xmm2
+ movq [rdi + rcx], xmm3
+
+ ; begin epilog
+ pop rdi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+;void vp8_idct_dequant_dc_full_2x_sse2
+; (
+; short *qcoeff - 0 (64 bytes read with movdqa, then zeroed)
+; short *dequant - 1 (32 bytes, applied to both blocks)
+; unsigned char *dst - 2 (4 rows of 8 bytes: read, adjusted, written back)
+; int dst_stride - 3
+; short *dc - 4 (two 16-bit values, one per block)
+; )
+globalsym(vp8_idct_dequant_dc_full_2x_sse2)
+sym(vp8_idct_dequant_dc_full_2x_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rdi
+ ; end prolog
+ ; Dequantizes two 16-coeff blocks, runs a two-pass transform and adds the result to dst.
+ ; general case: both blocks get the full two-pass 4x4 inverse transform
+ ; qcoeff is dequantized, then dc[] replaces the first coeff of each block
+ mov rax, arg(0) ; qcoeff
+ mov rdx, arg(1) ; dequant
+
+ mov rdi, arg(2) ; dst
+
+ ; Zero out xmm7, for use unpacking
+ pxor xmm7, xmm7
+
+
+ ; note the transpose of xmm1 and xmm2, necessary for shuffle
+ ; to spit out sensible data
+ movdqa xmm0, [rax] ; block 0, coeffs 0-7
+ movdqa xmm2, [rax+16] ; block 0, coeffs 8-15
+ movdqa xmm1, [rax+32] ; block 1, coeffs 0-7
+ movdqa xmm3, [rax+48] ; block 1, coeffs 8-15
+
+ ; Clear out coeffs (xmm7 is zero)
+ movdqa [rax], xmm7
+ movdqa [rax+16], xmm7
+ movdqa [rax+32], xmm7
+ movdqa [rax+48], xmm7
+
+ ; dequantize qcoeff buffer (the same 16 factors are used for both blocks)
+ pmullw xmm0, [rdx]
+ pmullw xmm2, [rdx+16]
+ pmullw xmm1, [rdx]
+ pmullw xmm3, [rdx+16]
+
+ ; DC component
+ mov rdx, arg(4) ; rdx now points at dc, not dequant
+
+ ; repack so block 0 row x and block 1 row x are together
+ movdqa xmm4, xmm0
+ punpckldq xmm0, xmm1
+ punpckhdq xmm4, xmm1
+
+ pshufd xmm0, xmm0, 11011000b ; dword order 0,2,1,3: block 0 row 0 | block 1 row 0
+ pshufd xmm1, xmm4, 11011000b ; block 0 row 1 | block 1 row 1
+
+ movdqa xmm4, xmm2
+ punpckldq xmm2, xmm3
+ punpckhdq xmm4, xmm3
+
+ pshufd xmm2, xmm2, 11011000b ; block 0 row 2 | block 1 row 2
+ pshufd xmm3, xmm4, 11011000b ; block 0 row 3 | block 1 row 3
+
+ ; insert DC component
+ pinsrw xmm0, [rdx], 0 ; dc[0] -> first coeff of block 0
+ pinsrw xmm0, [rdx+2], 4 ; dc[1] -> first coeff of block 1
+
+ ; first pass
+ psubw xmm0, xmm2 ; b1 = 0-2
+ paddw xmm2, xmm2 ;
+
+ movdqa xmm5, xmm1
+ paddw xmm2, xmm0 ; a1 = 0+2
+
+ pmulhw xmm5, [GLOBAL(x_s1sqr2)]
+ paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
+
+ movdqa xmm7, xmm3
+ pmulhw xmm7, [GLOBAL(x_c1sqr2less1)]
+
+ paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw xmm7, xmm5 ; c1
+
+ movdqa xmm5, xmm1
+ movdqa xmm4, xmm3
+
+ pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
+ paddw xmm5, xmm1
+
+ pmulhw xmm3, [GLOBAL(x_s1sqr2)]
+ paddw xmm3, xmm4
+
+ paddw xmm3, xmm5 ; d1
+ movdqa xmm6, xmm2 ; a1
+
+ movdqa xmm4, xmm0 ; b1
+ paddw xmm2, xmm3 ;0
+
+ paddw xmm4, xmm7 ;1
+ psubw xmm0, xmm7 ;2
+
+ psubw xmm6, xmm3 ;3
+
+ ; transpose for the second pass
+ movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000
+ punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000
+ punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100
+
+ movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008
+ punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008
+ punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108
+
+
+ movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000
+ punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000
+ punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002
+
+ movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100
+ punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100
+ punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102
+
+
+ movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000
+ punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000
+ punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001
+
+ movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002
+ punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002
+ punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003
+
+ pshufd xmm0, xmm2, 11011000b
+ pshufd xmm2, xmm1, 11011000b
+
+ pshufd xmm1, xmm5, 11011000b
+ pshufd xmm3, xmm7, 11011000b
+
+ ; second pass (same butterfly as the first, plus +4 rounding and >> 3)
+ psubw xmm0, xmm2 ; b1 = 0-2
+ paddw xmm2, xmm2
+
+ movdqa xmm5, xmm1
+ paddw xmm2, xmm0 ; a1 = 0+2
+
+ pmulhw xmm5, [GLOBAL(x_s1sqr2)]
+ paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
+
+ movdqa xmm7, xmm3
+ pmulhw xmm7, [GLOBAL(x_c1sqr2less1)]
+
+ paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw xmm7, xmm5 ; c1
+
+ movdqa xmm5, xmm1
+ movdqa xmm4, xmm3
+
+ pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
+ paddw xmm5, xmm1
+
+ pmulhw xmm3, [GLOBAL(x_s1sqr2)]
+ paddw xmm3, xmm4
+
+ paddw xmm3, xmm5 ; d1
+ paddw xmm0, [GLOBAL(fours)] ; rounding term folded into b1
+
+ paddw xmm2, [GLOBAL(fours)] ; rounding term folded into a1
+ movdqa xmm6, xmm2 ; a1
+
+ movdqa xmm4, xmm0 ; b1
+ paddw xmm2, xmm3 ;0
+
+ paddw xmm4, xmm7 ;1
+ psubw xmm0, xmm7 ;2
+
+ psubw xmm6, xmm3 ;3
+ psraw xmm2, 3
+
+ psraw xmm0, 3
+ psraw xmm4, 3
+
+ psraw xmm6, 3
+
+ ; transpose to save
+ movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000
+ punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000
+ punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100
+
+ movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008
+ punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008
+ punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108
+
+
+ movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000
+ punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000
+ punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002
+
+ movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100
+ punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100
+ punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102
+
+
+ movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000
+ punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000
+ punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001
+
+ movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002
+ punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002
+ punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003
+
+ pshufd xmm0, xmm2, 11011000b
+ pshufd xmm2, xmm1, 11011000b
+
+ pshufd xmm1, xmm5, 11011000b
+ pshufd xmm3, xmm7, 11011000b
+
+ pxor xmm7, xmm7 ; re-zero xmm7; it was used as a temporary above
+
+ ; Load up predict blocks
+ movsxd rdx, dword ptr arg(3) ; dst_stride
+ movq xmm4, [rdi]
+ movq xmm5, [rdi+rdx]
+ lea rcx, [rdx + rdx*2] ; rcx = 3 * dst_stride
+
+ punpcklbw xmm4, xmm7
+ punpcklbw xmm5, xmm7
+
+ paddw xmm0, xmm4
+ paddw xmm1, xmm5
+
+ movq xmm4, [rdi+rdx*2]
+ movq xmm5, [rdi+rcx]
+
+ punpcklbw xmm4, xmm7
+ punpcklbw xmm5, xmm7
+
+ paddw xmm2, xmm4
+ paddw xmm3, xmm5
+
+.finish:
+ ; NOTE(review): no jump to .finish is visible in this routine; it is reached by fall-through
+ ; pack up before storing (words saturate to unsigned bytes)
+ packuswb xmm0, xmm7
+ packuswb xmm1, xmm7
+ packuswb xmm2, xmm7
+ packuswb xmm3, xmm7
+
+ ; Load destination stride before writing out,
+ ; doesn't need to persist
+ movsxd rdx, dword ptr arg(3) ; dst_stride
+
+ ; store blocks back out
+ movq [rdi], xmm0
+ movq [rdi + rdx], xmm1
+
+ lea rdi, [rdi + 2*rdx] ; advance to row 2; the saved rdi is popped in the epilog
+
+ movq [rdi], xmm2
+ movq [rdi + rdx], xmm3
+
+
+ ; begin epilog
+ pop rdi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+fours: ; rounding term added before the psraw-by-3 downshifts
+ times 8 dw 0x0004
+align 16
+x_s1sqr2: ; 0x8A8C has its sign bit set as a signed word; users add the input back after pmulhw
+ times 8 dw 0x8A8C
+align 16
+x_c1sqr2less1: ; 0x4E7B; users add the input back after pmulhw (the "less1" part)
+ times 8 dw 0x4E7B
diff --git a/media/libvpx/libvpx/vp8/common/x86/iwalsh_sse2.asm b/media/libvpx/libvpx/vp8/common/x86/iwalsh_sse2.asm
new file mode 100644
index 0000000000..56f37c3e0f
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/x86/iwalsh_sse2.asm
@@ -0,0 +1,123 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+SECTION .text
+
+;void vp8_short_inv_walsh4x4_sse2(short *input, short *mb_dqcoeff)
+globalsym(vp8_short_inv_walsh4x4_sse2)
+sym(vp8_short_inv_walsh4x4_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 2
+ ; end prolog
+ ; Two add/subtract passes over 16 input words; each result is stored 32 bytes apart in mb_dqcoeff.
+ mov rcx, arg(0) ; input (read with movdqa)
+ mov rdx, arg(1) ; mb_dqcoeff
+ mov rax, 30003h ; two words of 3: the rounding term, broadcast into xmm0 below
+
+ movdqa xmm0, [rcx + 0] ;ip[4] ip[0]
+ movdqa xmm1, [rcx + 16] ;ip[12] ip[8]
+
+
+ pshufd xmm2, xmm1, 4eh ;ip[8] ip[12] (4eh swaps the two 64-bit halves)
+ movdqa xmm3, xmm0 ;ip[4] ip[0]
+
+ paddw xmm0, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
+ psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
+
+ movdqa xmm4, xmm0
+ punpcklqdq xmm0, xmm3 ;d1 a1
+ punpckhqdq xmm4, xmm3 ;c1 b1
+
+ movdqa xmm1, xmm4 ;c1 b1
+ paddw xmm4, xmm0 ;d1+c1 a1+b1 aka op[4] op[0]
+ psubw xmm0, xmm1 ;d1-c1 a1-b1 aka op[12] op[8]
+
+ ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ ; 13 12 11 10 03 02 01 00
+ ;
+ ; 33 32 31 30 23 22 21 20
+ ;
+ movdqa xmm3, xmm4 ; 13 12 11 10 03 02 01 00
+ punpcklwd xmm4, xmm0 ; 23 03 22 02 21 01 20 00
+ punpckhwd xmm3, xmm0 ; 33 13 32 12 31 11 30 10
+ movdqa xmm1, xmm4 ; 23 03 22 02 21 01 20 00
+ punpcklwd xmm4, xmm3 ; 31 21 11 01 30 20 10 00
+ punpckhwd xmm1, xmm3 ; 33 23 13 03 32 22 12 02
+ ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ movd xmm0, eax
+ pshufd xmm2, xmm1, 4eh ;ip[8] ip[12] (labels copied from pass 1; data is now the transposed pass-1 output)
+ movdqa xmm3, xmm4 ;ip[4] ip[0]
+
+ pshufd xmm0, xmm0, 0 ;03 03 03 03 03 03 03 03
+
+ paddw xmm4, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
+ psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
+
+ movdqa xmm5, xmm4
+ punpcklqdq xmm4, xmm3 ;d1 a1
+ punpckhqdq xmm5, xmm3 ;c1 b1
+
+ movdqa xmm1, xmm5 ;c1 b1
+ paddw xmm5, xmm4 ;d1+c1 a1+b1 aka op[4] op[0]
+ psubw xmm4, xmm1 ;d1-c1 a1-b1 aka op[12] op[8]
+
+ paddw xmm5, xmm0 ; + 3 rounding
+ paddw xmm4, xmm0 ; + 3 rounding
+ psraw xmm5, 3
+ psraw xmm4, 3
+
+ movd eax, xmm5 ; low two words of xmm5
+ movd ecx, xmm4 ; low two words of xmm4
+ psrldq xmm5, 4 ; expose the next two words
+ psrldq xmm4, 4
+ mov word ptr[rdx+32*0], ax
+ mov word ptr[rdx+32*2], cx
+ shr eax, 16
+ shr ecx, 16
+ mov word ptr[rdx+32*4], ax
+ mov word ptr[rdx+32*6], cx
+ movd eax, xmm5
+ movd ecx, xmm4
+ psrldq xmm5, 4
+ psrldq xmm4, 4
+ mov word ptr[rdx+32*8], ax
+ mov word ptr[rdx+32*10], cx
+ shr eax, 16
+ shr ecx, 16
+ mov word ptr[rdx+32*12], ax
+ mov word ptr[rdx+32*14], cx
+ ; upper four words of xmm5/xmm4 go to the odd-numbered 32-byte slots
+ movd eax, xmm5
+ movd ecx, xmm4
+ psrldq xmm5, 4
+ psrldq xmm4, 4
+ mov word ptr[rdx+32*1], ax
+ mov word ptr[rdx+32*3], cx
+ shr eax, 16
+ shr ecx, 16
+ mov word ptr[rdx+32*5], ax
+ mov word ptr[rdx+32*7], cx
+ movd eax, xmm5
+ movd ecx, xmm4
+ mov word ptr[rdx+32*9], ax
+ mov word ptr[rdx+32*11], cx
+ shr eax, 16
+ shr ecx, 16
+ mov word ptr[rdx+32*13], ax
+ mov word ptr[rdx+32*15], cx
+
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/media/libvpx/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm b/media/libvpx/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm
new file mode 100644
index 0000000000..8d12f5385d
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm
@@ -0,0 +1,817 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%macro LF_ABS 2
+ ; %1 value not preserved
+ ; %2 value preserved
+ ; output in %1: per-byte unsigned |%1 - %2|; clobbers scratch1
+ movdqa scratch1, %2 ; v2
+ ; each saturating subtract yields 0 where it would go negative, so OR-ing both gives abs
+ psubusb scratch1, %1 ; v2 - v1
+ psubusb %1, %2 ; v1 - v2
+ por %1, scratch1 ; abs(v2 - v1)
+%endmacro
+
+%macro LF_FILTER_HEV_MASK 8-9
+ ; %1-%8: p3 p2 p1 p0 q0 q1 q2 q3; optional %9: precomputed abs(p1 - p0). Out: %1 = mask, %5 = ~hev
+ LF_ABS %1, %2 ; abs(p3 - p2)
+ LF_ABS %2, %3 ; abs(p2 - p1)
+ pmaxub %1, %2 ; accumulate mask
+%if %0 == 8
+ movdqa scratch2, %3 ; save p1
+ LF_ABS scratch2, %4 ; abs(p1 - p0)
+%endif
+ LF_ABS %4, %5 ; abs(p0 - q0)
+ LF_ABS %5, %6 ; abs(q0 - q1)
+%if %0 == 8
+ pmaxub %5, scratch2 ; accumulate hev
+%else
+ pmaxub %5, %9
+%endif
+ pmaxub %1, %5 ; accumulate mask
+ ; %5 now holds max(abs(q0 - q1), abs(p1 - p0))
+ LF_ABS %3, %6 ; abs(p1 - q1)
+ LF_ABS %6, %7 ; abs(q1 - q2)
+ pmaxub %1, %6 ; accumulate mask
+ LF_ABS %7, %8 ; abs(q2 - q3), left in %7 when the macro ends
+ pmaxub %1, %7 ; accumulate mask
+
+ paddusb %4, %4 ; 2 * abs(p0 - q0)
+ pand %3, [GLOBAL(tfe)] ; clear bit 0 of each byte so the word shift stays within bytes
+ psrlw %3, 1 ; abs(p1 - q1) / 2
+ paddusb %4, %3 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
+ ; saturating subtracts leave 0 exactly where the value is <= the limit
+ psubusb %1, [limit]
+ psubusb %4, [blimit]
+ por %1, %4
+ pcmpeqb %1, zero ; mask: 0xff in bytes where both limit tests passed
+
+ psubusb %5, [thresh]
+ pcmpeqb %5, zero ; ~hev: 0xff in bytes where the max above is <= thresh
+%endmacro
+
+%macro LF_FILTER 6
+ ; %1-%4: p1-q1 (updated in place)
+ ; %5: mask (clobbered)
+ ; %6: ~hev (inverted: it is applied with pandn first, then with pand)
+ ; clobbers scratch1 and scratch2; expects the zero register to be all zeroes
+ movdqa scratch2, %6 ; save hev
+
+ pxor %1, [GLOBAL(t80)] ; ps1
+ pxor %4, [GLOBAL(t80)] ; qs1
+ movdqa scratch1, %1
+ psubsb scratch1, %4 ; signed_char_clamp(ps1 - qs1)
+ pandn scratch2, scratch1 ; vp8_filter &= hev
+
+ pxor %2, [GLOBAL(t80)] ; ps0
+ pxor %3, [GLOBAL(t80)] ; qs0
+ movdqa scratch1, %3
+ psubsb scratch1, %2 ; qs0 - ps0
+ paddsb scratch2, scratch1 ; vp8_filter += (qs0 - ps0)
+ paddsb scratch2, scratch1 ; vp8_filter += (qs0 - ps0)
+ paddsb scratch2, scratch1 ; vp8_filter += (qs0 - ps0)
+ pand %5, scratch2 ; &= mask
+
+ movdqa scratch2, %5
+ paddsb %5, [GLOBAL(t4)] ; Filter1
+ paddsb scratch2, [GLOBAL(t3)] ; Filter2
+
+ ; Filter1 >> 3 (per-byte arithmetic shift built from a word shift)
+ movdqa scratch1, zero
+ pcmpgtb scratch1, %5 ; 0xff in bytes where Filter1 is negative
+ psrlw %5, 3
+ pand scratch1, [GLOBAL(te0)] ; sign fill for the top 3 bits
+ pand %5, [GLOBAL(t1f)] ; drop bits shifted in from the neighbouring byte
+ por %5, scratch1
+
+ psubsb %3, %5 ; qs0 - Filter1
+ pxor %3, [GLOBAL(t80)]
+
+ ; Filter2 >> 3 (same shift emulation as above)
+ movdqa scratch1, zero
+ pcmpgtb scratch1, scratch2
+ psrlw scratch2, 3
+ pand scratch1, [GLOBAL(te0)]
+ pand scratch2, [GLOBAL(t1f)]
+ por scratch2, scratch1
+
+ paddsb %2, scratch2 ; ps0 + Filter2
+ pxor %2, [GLOBAL(t80)]
+
+ ; outer tap adjustments: (Filter1 + 1) >> 1, per byte
+ paddsb %5, [GLOBAL(t1)]
+ movdqa scratch1, zero
+ pcmpgtb scratch1, %5
+ psrlw %5, 1
+ pand scratch1, [GLOBAL(t80)] ; sign fill for the top bit
+ pand %5, [GLOBAL(t7f)] ; drop the bit shifted in from the neighbouring byte
+ por %5, scratch1
+ pand %5, %6 ; vp8_filter &= ~hev
+
+ psubsb %4, %5 ; qs1 - vp8_filter
+ pxor %4, [GLOBAL(t80)]
+
+ paddsb %1, %5 ; ps1 + vp8_filter
+ pxor %1, [GLOBAL(t80)]
+%endmacro
+
+SECTION .text
+
+;void vp8_loop_filter_bh_y_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh
+;)
+globalsym(vp8_loop_filter_bh_y_sse2)
+sym(vp8_loop_filter_bh_y_sse2):
+ ; Filters three edges (rows 3|4, 7|8, 11|12) across 16 rows of 16 bytes; rows are accessed with movdqa.
+%if LIBVPX_YASM_WIN64
+ %define src rcx ; src_ptr
+ %define stride rdx ; src_pixel_step
+ %define blimit r8
+ %define limit r9
+ %define thresh r10
+
+ %define spp rax
+ %define stride3 r11
+ %define stride5 r12
+ %define stride7 r13
+
+ push rbp
+ mov rbp, rsp
+ SAVE_XMM 11
+ push r12
+ push r13
+ mov thresh, arg(4) ; the fifth argument is fetched via arg() on this path
+%else
+ %define src rdi ; src_ptr
+ %define stride rsi ; src_pixel_step
+ %define blimit rdx
+ %define limit rcx
+ %define thresh r8
+
+ %define spp rax
+ %define stride3 r9
+ %define stride5 r10
+ %define stride7 r11
+%endif
+ ; registers used implicitly by LF_ABS / LF_FILTER_HEV_MASK / LF_FILTER
+ %define scratch1 xmm5
+ %define scratch2 xmm6
+ %define zero xmm7
+ ; iN addresses row N: even rows are based on src, odd rows on spp = src + stride
+ %define i0 [src]
+ %define i1 [spp]
+ %define i2 [src + 2 * stride]
+ %define i3 [spp + 2 * stride]
+ %define i4 [src + 4 * stride]
+ %define i5 [spp + 4 * stride]
+ %define i6 [src + 2 * stride3]
+ %define i7 [spp + 2 * stride3]
+ %define i8 [src + 8 * stride]
+ %define i9 [spp + 8 * stride]
+ %define i10 [src + 2 * stride5]
+ %define i11 [spp + 2 * stride5]
+ %define i12 [src + 4 * stride3]
+ %define i13 [spp + 4 * stride3]
+ %define i14 [src + 2 * stride7]
+ %define i15 [spp + 2 * stride7]
+
+ ; prep work
+ lea spp, [src + stride]
+ lea stride3, [stride + 2 * stride] ; 3 * stride
+ lea stride5, [stride3 + 2 * stride] ; 5 * stride
+ lea stride7, [stride3 + 4 * stride] ; 7 * stride
+ pxor zero, zero
+
+ ; load the first set into registers
+ movdqa xmm0, i0
+ movdqa xmm1, i1
+ movdqa xmm2, i2
+ movdqa xmm3, i3
+ movdqa xmm4, i4
+ movdqa xmm8, i5
+ movdqa xmm9, i6 ; q2, will contain abs(p1-p0)
+ movdqa xmm10, i7
+LF_FILTER_HEV_MASK xmm0, xmm1, xmm2, xmm3, xmm4, xmm8, xmm9, xmm10
+ ; the mask macro destroyed p1..q1, so reload them; xmm0 = mask, xmm4 = ~hev
+ movdqa xmm1, i2
+ movdqa xmm2, i3
+ movdqa xmm3, i4
+ movdqa xmm8, i5
+LF_FILTER xmm1, xmm2, xmm3, xmm8, xmm0, xmm4
+ movdqa i2, xmm1
+ movdqa i3, xmm2
+
+; second set
+ movdqa i4, xmm3
+ movdqa i5, xmm8
+ ; filtered rows 4 and 5 stay in xmm3/xmm8 and act as p3/p2 for this edge
+ movdqa xmm0, i6
+ movdqa xmm1, i7
+ movdqa xmm2, i8
+ movdqa xmm4, i9
+ movdqa xmm10, i10 ; q2, will contain abs(p1-p0)
+ movdqa xmm11, i11
+LF_FILTER_HEV_MASK xmm3, xmm8, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm9
+ ; xmm3 = mask, xmm2 = ~hev
+ movdqa xmm0, i6
+ movdqa xmm1, i7
+ movdqa xmm4, i8
+ movdqa xmm8, i9
+LF_FILTER xmm0, xmm1, xmm4, xmm8, xmm3, xmm2
+ movdqa i6, xmm0
+ movdqa i7, xmm1
+
+; last set
+ movdqa i8, xmm4
+ movdqa i9, xmm8
+ ; filtered rows 8 and 9 stay in xmm4/xmm8 and act as p3/p2 for this edge
+ movdqa xmm0, i10
+ movdqa xmm1, i11
+ movdqa xmm2, i12
+ movdqa xmm3, i13
+ movdqa xmm9, i14 ; q2, will contain abs(p1-p0)
+ movdqa xmm11, i15
+LF_FILTER_HEV_MASK xmm4, xmm8, xmm0, xmm1, xmm2, xmm3, xmm9, xmm11, xmm10
+ ; xmm4 = mask, xmm2 = ~hev
+ movdqa xmm0, i10
+ movdqa xmm1, i11
+ movdqa xmm3, i12
+ movdqa xmm8, i13
+LF_FILTER xmm0, xmm1, xmm3, xmm8, xmm4, xmm2
+ movdqa i10, xmm0
+ movdqa i11, xmm1
+ movdqa i12, xmm3
+ movdqa i13, xmm8
+
+%if LIBVPX_YASM_WIN64
+ pop r13
+ pop r12
+ RESTORE_XMM
+ pop rbp
+%endif
+
+ ret
+
+
+;void vp8_loop_filter_bv_y_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh
+;)
+
+globalsym(vp8_loop_filter_bv_y_sse2)
+sym(vp8_loop_filter_bv_y_sse2):
+ ; Transposes 16 rows of 16 bytes onto the stack, filters three edges there, transposes back and stores.
+%if LIBVPX_YASM_WIN64
+ %define src rcx ; src_ptr
+ %define stride rdx ; src_pixel_step
+ %define blimit r8
+ %define limit r9
+ %define thresh r10
+
+ %define spp rax
+ %define stride3 r11
+ %define stride5 r12
+ %define stride7 r13
+
+ push rbp
+ mov rbp, rsp
+ SAVE_XMM 15
+ push r12
+ push r13
+ mov thresh, arg(4)
+%else
+ %define src rdi
+ %define stride rsi
+ %define blimit rdx
+ %define limit rcx
+ %define thresh r8
+
+ %define spp rax
+ %define stride3 r9
+ %define stride5 r10
+ %define stride7 r11
+%endif
+ ; registers used implicitly by LF_ABS / LF_FILTER_HEV_MASK / LF_FILTER
+ %define scratch1 xmm5
+ %define scratch2 xmm6
+ %define zero xmm7
+ ; sN addresses source row N: even rows are based on src, odd rows on spp = src + stride
+ %define s0 [src]
+ %define s1 [spp]
+ %define s2 [src + 2 * stride]
+ %define s3 [spp + 2 * stride]
+ %define s4 [src + 4 * stride]
+ %define s5 [spp + 4 * stride]
+ %define s6 [src + 2 * stride3]
+ %define s7 [spp + 2 * stride3]
+ %define s8 [src + 8 * stride]
+ %define s9 [spp + 8 * stride]
+ %define s10 [src + 2 * stride5]
+ %define s11 [spp + 2 * stride5]
+ %define s12 [src + 4 * stride3]
+ %define s13 [spp + 4 * stride3]
+ %define s14 [src + 2 * stride7]
+ %define s15 [spp + 2 * stride7]
+ ; iN addresses the N-th 16-byte slot of the stack buffer reserved below
+ %define i0 [rsp]
+ %define i1 [rsp + 16]
+ %define i2 [rsp + 32]
+ %define i3 [rsp + 48]
+ %define i4 [rsp + 64]
+ %define i5 [rsp + 80]
+ %define i6 [rsp + 96]
+ %define i7 [rsp + 112]
+ %define i8 [rsp + 128]
+ %define i9 [rsp + 144]
+ %define i10 [rsp + 160]
+ %define i11 [rsp + 176]
+ %define i12 [rsp + 192]
+ %define i13 [rsp + 208]
+ %define i14 [rsp + 224]
+ %define i15 [rsp + 240]
+
+ ALIGN_STACK 16, rax
+
+ ; reserve stack space
+ %define temp_storage 0 ; size is 256 (16*16)
+ %define stack_size 256
+ sub rsp, stack_size
+
+ ; prep work
+ lea spp, [src + stride]
+ lea stride3, [stride + 2 * stride] ; 3 * stride
+ lea stride5, [stride3 + 2 * stride] ; 5 * stride
+ lea stride7, [stride3 + 4 * stride] ; 7 * stride
+
+ ; 8-f
+ movdqa xmm0, s8
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, s9 ; 80 90
+ punpckhbw xmm1, s9 ; 88 98
+
+ movdqa xmm2, s10
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, s11 ; a0 b0
+ punpckhbw xmm3, s11 ; a8 b8
+
+ movdqa xmm4, xmm0
+ punpcklwd xmm0, xmm2 ; 80 90 a0 b0
+ punpckhwd xmm4, xmm2 ; 84 94 a4 b4
+
+ movdqa xmm2, xmm1
+ punpcklwd xmm1, xmm3 ; 88 98 a8 b8
+ punpckhwd xmm2, xmm3 ; 8c 9c ac bc
+
+ ; using xmm[0124]
+ ; work on next 4 rows
+
+ movdqa xmm3, s12
+ movdqa xmm5, xmm3
+ punpcklbw xmm3, s13 ; c0 d0
+ punpckhbw xmm5, s13 ; c8 d8
+
+ movdqa xmm6, s14
+ movdqa xmm7, xmm6
+ punpcklbw xmm6, s15 ; e0 f0
+ punpckhbw xmm7, s15 ; e8 f8
+
+ movdqa xmm8, xmm3
+ punpcklwd xmm3, xmm6 ; c0 d0 e0 f0
+ punpckhwd xmm8, xmm6 ; c4 d4 e4 f4
+
+ movdqa xmm6, xmm5
+ punpcklwd xmm5, xmm7 ; c8 d8 e8 f8
+ punpckhwd xmm6, xmm7 ; cc dc ec fc
+
+ ; pull the third and fourth sets together
+
+ movdqa xmm7, xmm0
+ punpckldq xmm0, xmm3 ; 80 90 a0 b0 c0 d0 e0 f0
+ punpckhdq xmm7, xmm3 ; 82 92 a2 b2 c2 d2 e2 f2
+
+ movdqa xmm3, xmm4
+ punpckldq xmm4, xmm8 ; 84 94 a4 b4 c4 d4 e4 f4
+ punpckhdq xmm3, xmm8 ; 86 96 a6 b6 c6 d6 e6 f6
+
+ movdqa xmm8, xmm1
+ punpckldq xmm1, xmm5 ; 88 98 a8 b8 c8 d8 e8 f8
+ punpckhdq xmm8, xmm5 ; 8a 9a aa ba ca da ea fa
+
+ movdqa xmm5, xmm2
+ punpckldq xmm2, xmm6 ; 8c 9c ac bc cc dc ec fc
+ punpckhdq xmm5, xmm6 ; 8e 9e ae be ce de ee fe
+
+ ; save the calculations. we only have 16 xmm registers ...
+ movdqa i0, xmm0
+ movdqa i1, xmm7
+ movdqa i2, xmm4
+ movdqa i3, xmm3
+ movdqa i4, xmm1
+ movdqa i5, xmm8
+ movdqa i6, xmm2
+ movdqa i7, xmm5
+
+ ; 0-7
+ movdqa xmm0, s0
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, s1 ; 00 10
+ punpckhbw xmm1, s1 ; 08 18
+
+ movdqa xmm2, s2
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, s3 ; 20 30
+ punpckhbw xmm3, s3 ; 28 38
+
+ movdqa xmm4, xmm0
+ punpcklwd xmm0, xmm2 ; 00 10 20 30
+ punpckhwd xmm4, xmm2 ; 04 14 24 34
+
+ movdqa xmm2, xmm1
+ punpcklwd xmm1, xmm3 ; 08 18 28 38
+ punpckhwd xmm2, xmm3 ; 0c 1c 2c 3c
+
+ ; using xmm[0124]
+ ; work on next 4 rows
+
+ movdqa xmm3, s4
+ movdqa xmm5, xmm3
+ punpcklbw xmm3, s5 ; 40 50
+ punpckhbw xmm5, s5 ; 48 58
+
+ movdqa xmm6, s6
+ movdqa xmm7, xmm6
+ punpcklbw xmm6, s7 ; 60 70
+ punpckhbw xmm7, s7 ; 68 78
+
+ movdqa xmm8, xmm3
+ punpcklwd xmm3, xmm6 ; 40 50 60 70
+ punpckhwd xmm8, xmm6 ; 44 54 64 74
+
+ movdqa xmm6, xmm5
+ punpcklwd xmm5, xmm7 ; 48 58 68 78
+ punpckhwd xmm6, xmm7 ; 4c 5c 6c 7c
+
+ ; pull the first two sets together
+
+ movdqa xmm7, xmm0
+ punpckldq xmm0, xmm3 ; 00 10 20 30 40 50 60 70
+ punpckhdq xmm7, xmm3 ; 02 12 22 32 42 52 62 72
+
+ movdqa xmm3, xmm4
+ punpckldq xmm4, xmm8 ; 04 14 24 34 44 54 64 74
+ punpckhdq xmm3, xmm8 ; 06 16 26 36 46 56 66 76
+
+ movdqa xmm8, xmm1
+ punpckldq xmm1, xmm5 ; 08 18 28 38 48 58 68 78
+ punpckhdq xmm8, xmm5 ; 0a 1a 2a 3a 4a 5a 6a 7a
+
+ movdqa xmm5, xmm2
+ punpckldq xmm2, xmm6 ; 0c 1c 2c 3c 4c 5c 6c 7c
+ punpckhdq xmm5, xmm6 ; 0e 1e 2e 3e 4e 5e 6e 7e
+ ; final combination
+ ; low qword = rows 0-7 half (register), high qword = rows 8-f half (stack)
+ movdqa xmm6, xmm0
+ punpcklqdq xmm0, i0
+ punpckhqdq xmm6, i0
+
+ movdqa xmm9, xmm7
+ punpcklqdq xmm7, i1
+ punpckhqdq xmm9, i1
+
+ movdqa xmm10, xmm4
+ punpcklqdq xmm4, i2
+ punpckhqdq xmm10, i2
+
+ movdqa xmm11, xmm3
+ punpcklqdq xmm3, i3
+ punpckhqdq xmm11, i3
+
+ movdqa xmm12, xmm1
+ punpcklqdq xmm1, i4
+ punpckhqdq xmm12, i4
+
+ movdqa xmm13, xmm8
+ punpcklqdq xmm8, i5
+ punpckhqdq xmm13, i5
+
+ movdqa xmm14, xmm2
+ punpcklqdq xmm2, i6
+ punpckhqdq xmm14, i6
+
+ movdqa xmm15, xmm5
+ punpcklqdq xmm5, i7
+ punpckhqdq xmm15, i7
+ ; stack slot iN now receives transposed row N (source column N of all 16 rows)
+ movdqa i0, xmm0
+ movdqa i1, xmm6
+ movdqa i2, xmm7
+ movdqa i3, xmm9
+ movdqa i4, xmm4
+ movdqa i5, xmm10
+ movdqa i6, xmm3
+ movdqa i7, xmm11
+ movdqa i8, xmm1
+ movdqa i9, xmm12
+ movdqa i10, xmm8
+ movdqa i11, xmm13
+ movdqa i12, xmm2
+ movdqa i13, xmm14
+ movdqa i14, xmm5
+ movdqa i15, xmm15
+
+; TRANSPOSED DATA AVAILABLE ON THE STACK
+ ; xmm6/xmm7 double as scratch2/zero, so move i1/i2 out of them first
+ movdqa xmm12, xmm6
+ movdqa xmm13, xmm7
+
+ pxor zero, zero
+
+LF_FILTER_HEV_MASK xmm0, xmm12, xmm13, xmm9, xmm4, xmm10, xmm3, xmm11
+ ; xmm0 = mask, xmm4 = ~hev, xmm3 = abs(i6 - i7) kept for the second set
+ movdqa xmm1, i2
+ movdqa xmm2, i3
+ movdqa xmm8, i4
+ movdqa xmm9, i5
+LF_FILTER xmm1, xmm2, xmm8, xmm9, xmm0, xmm4
+ movdqa i2, xmm1
+ movdqa i3, xmm2
+
+; second set
+ movdqa i4, xmm8
+ movdqa i5, xmm9
+ ; filtered i4/i5 stay in xmm8/xmm9 and act as p3/p2 for this edge
+ movdqa xmm0, i6
+ movdqa xmm1, i7
+ movdqa xmm2, i8
+ movdqa xmm4, i9
+ movdqa xmm10, i10 ; q2, will contain abs(p1-p0)
+ movdqa xmm11, i11
+LF_FILTER_HEV_MASK xmm8, xmm9, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm3
+ ; xmm8 = mask, xmm2 = ~hev
+ movdqa xmm0, i6
+ movdqa xmm1, i7
+ movdqa xmm3, i8
+ movdqa xmm4, i9
+LF_FILTER xmm0, xmm1, xmm3, xmm4, xmm8, xmm2
+ movdqa i6, xmm0
+ movdqa i7, xmm1
+
+; last set
+ movdqa i8, xmm3
+ movdqa i9, xmm4
+ ; filtered i8/i9 stay in xmm3/xmm4 and act as p3/p2 for this edge
+ movdqa xmm0, i10
+ movdqa xmm1, i11
+ movdqa xmm2, i12
+ movdqa xmm8, i13
+ movdqa xmm9, i14 ; q2, will contain abs(p1-p0)
+ movdqa xmm11, i15
+LF_FILTER_HEV_MASK xmm3, xmm4, xmm0, xmm1, xmm2, xmm8, xmm9, xmm11, xmm10
+ ; xmm3 = mask, xmm2 = ~hev
+ movdqa xmm0, i10
+ movdqa xmm1, i11
+ movdqa xmm4, i12
+ movdqa xmm8, i13
+LF_FILTER xmm0, xmm1, xmm4, xmm8, xmm3, xmm2
+ movdqa i10, xmm0
+ movdqa i11, xmm1
+ movdqa i12, xmm4
+ movdqa i13, xmm8
+
+ ; the byte-position comments below are copied from the first transpose; the inputs are now stack slots
+; RESHUFFLE AND WRITE OUT
+ ; 8-f
+ movdqa xmm0, i8
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, i9 ; 80 90
+ punpckhbw xmm1, i9 ; 88 98
+
+ movdqa xmm2, i10
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, i11 ; a0 b0
+ punpckhbw xmm3, i11 ; a8 b8
+
+ movdqa xmm4, xmm0
+ punpcklwd xmm0, xmm2 ; 80 90 a0 b0
+ punpckhwd xmm4, xmm2 ; 84 94 a4 b4
+
+ movdqa xmm2, xmm1
+ punpcklwd xmm1, xmm3 ; 88 98 a8 b8
+ punpckhwd xmm2, xmm3 ; 8c 9c ac bc
+
+ ; using xmm[0124]
+ ; work on next 4 rows
+
+ movdqa xmm3, i12
+ movdqa xmm5, xmm3
+ punpcklbw xmm3, i13 ; c0 d0
+ punpckhbw xmm5, i13 ; c8 d8
+
+ movdqa xmm6, i14
+ movdqa xmm7, xmm6
+ punpcklbw xmm6, i15 ; e0 f0
+ punpckhbw xmm7, i15 ; e8 f8
+
+ movdqa xmm8, xmm3
+ punpcklwd xmm3, xmm6 ; c0 d0 e0 f0
+ punpckhwd xmm8, xmm6 ; c4 d4 e4 f4
+
+ movdqa xmm6, xmm5
+ punpcklwd xmm5, xmm7 ; c8 d8 e8 f8
+ punpckhwd xmm6, xmm7 ; cc dc ec fc
+
+ ; pull the third and fourth sets together
+
+ movdqa xmm7, xmm0
+ punpckldq xmm0, xmm3 ; 80 90 a0 b0 c0 d0 e0 f0
+ punpckhdq xmm7, xmm3 ; 82 92 a2 b2 c2 d2 e2 f2
+
+ movdqa xmm3, xmm4
+ punpckldq xmm4, xmm8 ; 84 94 a4 b4 c4 d4 e4 f4
+ punpckhdq xmm3, xmm8 ; 86 96 a6 b6 c6 d6 e6 f6
+
+ movdqa xmm8, xmm1
+ punpckldq xmm1, xmm5 ; 88 98 a8 b8 c8 d8 e8 f8
+ punpckhdq xmm8, xmm5 ; 8a 9a aa ba ca da ea fa
+
+ movdqa xmm5, xmm2
+ punpckldq xmm2, xmm6 ; 8c 9c ac bc cc dc ec fc
+ punpckhdq xmm5, xmm6 ; 8e 9e ae be ce de ee fe
+
+ ; save the calculations. we only have 16 xmm registers ...
+ movdqa i8, xmm0
+ movdqa i9, xmm7
+ movdqa i10, xmm4
+ movdqa i11, xmm3
+ movdqa i12, xmm1
+ movdqa i13, xmm8
+ movdqa i14, xmm2
+ movdqa i15, xmm5
+
+ ; 0-7
+ movdqa xmm0, i0
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, i1 ; 00 10
+ punpckhbw xmm1, i1 ; 08 18
+
+ movdqa xmm2, i2
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, i3 ; 20 30
+ punpckhbw xmm3, i3 ; 28 38
+
+ movdqa xmm4, xmm0
+ punpcklwd xmm0, xmm2 ; 00 10 20 30
+ punpckhwd xmm4, xmm2 ; 04 14 24 34
+
+ movdqa xmm2, xmm1
+ punpcklwd xmm1, xmm3 ; 08 18 28 38
+ punpckhwd xmm2, xmm3 ; 0c 1c 2c 3c
+
+ ; using xmm[0124]
+ ; work on next 4 rows
+
+ movdqa xmm3, i4
+ movdqa xmm5, xmm3
+ punpcklbw xmm3, i5 ; 40 50
+ punpckhbw xmm5, i5 ; 48 58
+
+ movdqa xmm6, i6
+ movdqa xmm7, xmm6
+ punpcklbw xmm6, i7 ; 60 70
+ punpckhbw xmm7, i7 ; 68 78
+
+ movdqa xmm8, xmm3
+ punpcklwd xmm3, xmm6 ; 40 50 60 70
+ punpckhwd xmm8, xmm6 ; 44 54 64 74
+
+ movdqa xmm6, xmm5
+ punpcklwd xmm5, xmm7 ; 48 58 68 78
+ punpckhwd xmm6, xmm7 ; 4c 5c 6c 7c
+
+ ; pull the first two sets together
+
+ movdqa xmm7, xmm0
+ punpckldq xmm0, xmm3 ; 00 10 20 30 40 50 60 70
+ punpckhdq xmm7, xmm3 ; 02 12 22 32 42 52 62 72
+
+ movdqa xmm3, xmm4
+ punpckldq xmm4, xmm8 ; 04 14 24 34 44 54 64 74
+ punpckhdq xmm3, xmm8 ; 06 16 26 36 46 56 66 76
+
+ movdqa xmm8, xmm1
+ punpckldq xmm1, xmm5 ; 08 18 28 38 48 58 68 78
+ punpckhdq xmm8, xmm5 ; 0a 1a 2a 3a 4a 5a 6a 7a
+
+ movdqa xmm5, xmm2
+ punpckldq xmm2, xmm6 ; 0c 1c 2c 3c 4c 5c 6c 7c
+ punpckhdq xmm5, xmm6 ; 0e 1e 2e 3e 4e 5e 6e 7e
+ ; final combination
+
+ movdqa xmm6, xmm0
+ punpcklqdq xmm0, i8
+ punpckhqdq xmm6, i8
+
+ movdqa xmm9, xmm7
+ punpcklqdq xmm7, i9
+ punpckhqdq xmm9, i9
+
+ movdqa xmm10, xmm4
+ punpcklqdq xmm4, i10
+ punpckhqdq xmm10, i10
+
+ movdqa xmm11, xmm3
+ punpcklqdq xmm3, i11
+ punpckhqdq xmm11, i11
+
+ movdqa xmm12, xmm1
+ punpcklqdq xmm1, i12
+ punpckhqdq xmm12, i12
+
+ movdqa xmm13, xmm8
+ punpcklqdq xmm8, i13
+ punpckhqdq xmm13, i13
+
+ movdqa xmm14, xmm2
+ punpcklqdq xmm2, i14
+ punpckhqdq xmm14, i14
+
+ movdqa xmm15, xmm5
+ punpcklqdq xmm5, i15
+ punpckhqdq xmm15, i15
+ ; write all 16 rows back to the source buffer
+ movdqa s0, xmm0
+ movdqa s1, xmm6
+ movdqa s2, xmm7
+ movdqa s3, xmm9
+ movdqa s4, xmm4
+ movdqa s5, xmm10
+ movdqa s6, xmm3
+ movdqa s7, xmm11
+ movdqa s8, xmm1
+ movdqa s9, xmm12
+ movdqa s10, xmm8
+ movdqa s11, xmm13
+ movdqa s12, xmm2
+ movdqa s13, xmm14
+ movdqa s14, xmm5
+ movdqa s15, xmm15
+
+ ; free stack space
+ add rsp, stack_size
+
+ ; un-ALIGN_STACK
+ pop rsp
+
+%if LIBVPX_YASM_WIN64
+ pop r13
+ pop r12
+ RESTORE_XMM
+ pop rbp
+%endif
+
+ ret
+
+SECTION_RODATA
+align 16
+te0: ; top 3 bits per byte: sign fill for the emulated per-byte >> 3 in LF_FILTER
+ times 16 db 0xe0
+align 16
+t7f: ; low 7 bits per byte: kept after the emulated per-byte >> 1 in LF_FILTER
+ times 16 db 0x7f
+align 16
+tfe: ; clears bit 0 of each byte before the psrlw-by-1 in LF_FILTER_HEV_MASK
+ times 16 db 0xfe
+align 16
+t1f: ; low 5 bits per byte: kept after the emulated per-byte >> 3 in LF_FILTER
+ times 16 db 0x1f
+align 16
+t80: ; XORed with pixels around the signed arithmetic; also the sign fill for >> 1
+ times 16 db 0x80
+align 16
+t1: ; added to Filter1 before the >> 1 for the outer taps
+ times 16 db 0x01
+align 16
+t3: ; added to the masked filter value to form Filter2
+ times 16 db 0x03
+align 16
+t4: ; added to the masked filter value to form Filter1
+ times 16 db 0x04
diff --git a/media/libvpx/libvpx/vp8/common/x86/loopfilter_sse2.asm b/media/libvpx/libvpx/vp8/common/x86/loopfilter_sse2.asm
new file mode 100644
index 0000000000..ce5c313138
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/x86/loopfilter_sse2.asm
@@ -0,0 +1,1642 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+; Byte offsets, relative to rsp, of the 16-byte scratch slots that the
+; functions in this file reserve with "sub rsp, lf_var_size".
+; _t0/_t1 are temporaries; _p3.._q3 hold the eight pixel rows around the edge.
+%define _t0 0
+%define _t1 _t0 + 16
+%define _p3 _t1 + 16
+%define _p2 _p3 + 16
+%define _p1 _p2 + 16
+%define _p0 _p1 + 16
+%define _q0 _p0 + 16
+%define _q1 _q0 + 16
+%define _q2 _q1 + 16
+%define _q3 _q2 + 16
+; ten 16-byte slots (_t0 .. _q3) = 160 bytes
+%define lf_var_size 160
+
+; Use of pmaxub instead of psubusb to compute filter mask was seen
+; in ffvp8
+
+; LFH_FILTER_AND_HEV_MASK %1
+; Builds the filter mask and the high-edge-variance (hev) mask for a
+; horizontal edge.
+; %1 != 0: each row is one aligned 16-byte load addressed from rsi/rdi with
+; pitch rax; rax is negated inside the macro.
+; %1 == 0: each row is two 8-byte halves (low half from rsi, high half from
+; rdi); q2, q1, p2 and p1 are also saved to the stack and rsi/rdi
+; are moved by 4*rax.
+; In: rdx -> limit; blimit and thresh are fetched from arg(2) / arg(4).
+; Out: xmm1 = mask (0xff in lanes that pass the limit/blimit tests),
+; xmm4 = hev, xmm0 = q0, xmm6 = p0, xmm7 = 0,
+; [rsp+_t0] = abs(q0-q1), [rsp+_t1] = abs(p1-p0). rdx is clobbered.
+%macro LFH_FILTER_AND_HEV_MASK 1
+%if %1
+ movdqa xmm2, [rdi+2*rax] ; q3
+ movdqa xmm1, [rsi+2*rax] ; q2
+ movdqa xmm4, [rsi+rax] ; q1
+ movdqa xmm5, [rsi] ; q0
+ neg rax ; negate pitch to deal with above border
+%else
+ movlps xmm2, [rsi + rcx*2] ; q3
+ movlps xmm1, [rsi + rcx] ; q2
+ movlps xmm4, [rsi] ; q1
+ movlps xmm5, [rsi + rax] ; q0
+
+ movhps xmm2, [rdi + rcx*2]
+ movhps xmm1, [rdi + rcx]
+ movhps xmm4, [rdi]
+ movhps xmm5, [rdi + rax]
+
+ lea rsi, [rsi + rax*4]
+ lea rdi, [rdi + rax*4]
+
+ movdqa [rsp+_q2], xmm1 ; store q2
+ movdqa [rsp+_q1], xmm4 ; store q1
+%endif
+ movdqa xmm7, [rdx] ;limit
+
+ movdqa xmm6, xmm1 ; q2
+ movdqa xmm3, xmm4 ; q1
+
+ ; abs(a-b) is formed as (a -sat b) | (b -sat a): one of the two is always 0
+ psubusb xmm1, xmm2 ; q2-=q3
+ psubusb xmm2, xmm6 ; q3-=q2
+
+ psubusb xmm4, xmm6 ; q1-=q2
+ psubusb xmm6, xmm3 ; q2-=q1
+
+ por xmm4, xmm6 ; abs(q2-q1)
+ por xmm1, xmm2 ; abs(q3-q2)
+
+ movdqa xmm0, xmm5 ; q0
+ pmaxub xmm1, xmm4 ; xmm1 accumulates the running max of the differences
+
+ psubusb xmm5, xmm3 ; q0-=q1
+ psubusb xmm3, xmm0 ; q1-=q0
+
+ por xmm5, xmm3 ; abs(q0-q1)
+ movdqa [rsp+_t0], xmm5 ; save to t0
+
+ pmaxub xmm1, xmm5
+
+%if %1
+ movdqa xmm2, [rsi+4*rax] ; p3
+ movdqa xmm4, [rdi+4*rax] ; p2
+ movdqa xmm6, [rsi+2*rax] ; p1
+%else
+ movlps xmm2, [rsi + rax] ; p3
+ movlps xmm4, [rsi] ; p2
+ movlps xmm6, [rsi + rcx] ; p1
+
+ movhps xmm2, [rdi + rax]
+ movhps xmm4, [rdi]
+ movhps xmm6, [rdi + rcx]
+
+ movdqa [rsp+_p2], xmm4 ; store p2
+ movdqa [rsp+_p1], xmm6 ; store p1
+%endif
+
+ movdqa xmm5, xmm4 ; p2
+ movdqa xmm3, xmm6 ; p1
+
+ psubusb xmm4, xmm2 ; p2-=p3
+ psubusb xmm2, xmm5 ; p3-=p2
+
+ psubusb xmm3, xmm5 ; p1-=p2
+ pmaxub xmm1, xmm4 ; abs(p3 - p2)
+
+ psubusb xmm5, xmm6 ; p2-=p1
+ pmaxub xmm1, xmm2 ; abs(p3 - p2)
+
+ pmaxub xmm1, xmm5 ; abs(p2 - p1)
+ movdqa xmm2, xmm6 ; p1
+
+ pmaxub xmm1, xmm3 ; abs(p2 - p1)
+%if %1
+ movdqa xmm4, [rsi+rax] ; p0
+ movdqa xmm3, [rdi] ; q1
+%else
+ movlps xmm4, [rsi + rcx*2] ; p0
+ movhps xmm4, [rdi + rcx*2]
+ movdqa xmm3, [rsp+_q1] ; q1
+%endif
+
+ movdqa xmm5, xmm4 ; p0
+ psubusb xmm4, xmm6 ; p0-=p1
+
+ psubusb xmm6, xmm5 ; p1-=p0
+
+ por xmm6, xmm4 ; abs(p1 - p0)
+ mov rdx, arg(2) ; get blimit
+
+ movdqa [rsp+_t1], xmm6 ; save to t1
+
+ movdqa xmm4, xmm3 ; q1
+ pmaxub xmm1, xmm6
+
+ psubusb xmm3, xmm2 ; q1-=p1
+ psubusb xmm2, xmm4 ; p1-=q1
+
+ psubusb xmm1, xmm7 ; max difference - limit (non-zero where limit is exceeded)
+ por xmm2, xmm3 ; abs(p1-q1)
+
+ movdqa xmm7, [rdx] ; blimit
+ mov rdx, arg(4) ; hev get thresh
+
+ movdqa xmm3, xmm0 ; q0
+ pand xmm2, [GLOBAL(tfe)] ; set lsb of each byte to zero
+
+ movdqa xmm6, xmm5 ; p0
+ psrlw xmm2, 1 ; abs(p1-q1)/2
+
+ psubusb xmm5, xmm3 ; p0-=q0
+ psubusb xmm3, xmm6 ; q0-=p0
+ por xmm5, xmm3 ; abs(p0 - q0)
+
+ paddusb xmm5, xmm5 ; abs(p0-q0)*2
+
+ movdqa xmm4, [rsp+_t0] ; hev get abs (q1 - q0)
+ movdqa xmm3, [rsp+_t1] ; get abs (p1 - p0)
+
+ paddusb xmm5, xmm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+ movdqa xmm2, [rdx] ; hev
+
+ psubusb xmm5, xmm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
+ psubusb xmm4, xmm2 ; hev
+
+ psubusb xmm3, xmm2 ; hev
+ por xmm1, xmm5
+
+ pxor xmm7, xmm7
+ paddb xmm4, xmm3 ; hev abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+
+ ; NOTE(review): compares against xmm5 (the blimit residue) rather than the
+ ; zeroed xmm7; presumably harmless because lanes with xmm5 != 0 are rejected
+ ; by the mask in xmm1 anyway - confirm.
+ pcmpeqb xmm4, xmm5 ; hev
+ pcmpeqb xmm3, xmm3 ; hev
+
+ pcmpeqb xmm1, xmm7 ; mask xmm1
+ pxor xmm4, xmm3 ; hev
+%endmacro
+
+; B_FILTER %1
+; Applies the filter to p1, p0, q0 and q1 using the mask in xmm1 and the
+; hev mask in xmm4.
+; %1 == 0: p1/q1 come from the stack, p0/q0 are expected in xmm6/xmm0;
+; results are stored as 8-byte halves (low via rsi, high via rdi).
+; %1 == 1: p1/q1 come from [rsi+2*rax] / [rdi], p0/q0 are expected in
+; xmm6/xmm0; results are stored with aligned 16-byte stores.
+; %1 == 2: p1, p0, q0, q1 are all loaded from the stack and nothing is
+; stored.
+; Out (all variants): xmm1 = p1, xmm6 = p0, xmm3 = q0, xmm7 = q1.
+%macro B_FILTER 1
+ movdqa xmm3, [GLOBAL(t80)]
+%if %1 == 0
+ movdqa xmm2, [rsp+_p1] ; p1
+ movdqa xmm7, [rsp+_q1] ; q1
+%elif %1 == 1
+ movdqa xmm2, [rsi+2*rax] ; p1
+ movdqa xmm7, [rdi] ; q1
+%elif %1 == 2
+ movdqa xmm2, [rsp+_p1] ; p1
+ movdqa xmm6, [rsp+_p0] ; p0
+ movdqa xmm0, [rsp+_q0] ; q0
+ movdqa xmm7, [rsp+_q1] ; q1
+%endif
+
+ pxor xmm2, xmm3 ; p1 offset to convert to signed values
+ pxor xmm7, xmm3 ; q1 offset to convert to signed values
+
+ psubsb xmm2, xmm7 ; p1 - q1
+ pxor xmm6, xmm3 ; offset to convert to signed values
+
+ pand xmm2, xmm4 ; high var mask (hvm)(p1 - q1)
+ pxor xmm0, xmm3 ; offset to convert to signed values
+
+ movdqa xmm3, xmm0 ; q0
+ psubsb xmm0, xmm6 ; q0 - p0
+ paddsb xmm2, xmm0 ; 1 * (q0 - p0) + hvm(p1 - q1)
+ paddsb xmm2, xmm0 ; 2 * (q0 - p0) + hvm(p1 - q1)
+ paddsb xmm2, xmm0 ; 3 * (q0 - p0) + hvm(p1 - q1)
+ pand xmm1, xmm2 ; mask filter values we don't care about
+
+ movdqa xmm2, xmm1
+ paddsb xmm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4
+ paddsb xmm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3
+
+ ; The unpacks below place each filter byte in the high byte of a word; the
+ ; low byte is whatever the other operand held. psraw by 11 (8 + 3) then
+ ; discards that low byte and performs the signed >> 3.
+ punpckhbw xmm5, xmm2 ; axbxcxdx
+ punpcklbw xmm2, xmm2 ; exfxgxhx
+
+ punpcklbw xmm0, xmm1 ; exfxgxhx
+ psraw xmm5, 11 ; sign extended shift right by 3
+
+ punpckhbw xmm1, xmm1 ; axbxcxdx
+ psraw xmm2, 11 ; sign extended shift right by 3
+
+ packsswb xmm2, xmm5 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
+ psraw xmm0, 11 ; sign extended shift right by 3
+
+ psraw xmm1, 11 ; sign extended shift right by 3
+ movdqa xmm5, xmm0 ; save results
+
+ packsswb xmm0, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
+
+ paddsb xmm6, xmm2 ; p0+= p0 add
+
+ ; second tap: add [ones] to the word-sized (... + 4) >> 3 values, then >> 1
+ movdqa xmm2, [GLOBAL(ones)]
+ paddsw xmm5, xmm2
+ paddsw xmm1, xmm2
+ psraw xmm5, 1 ; partial shifted one more time for 2nd tap
+ psraw xmm1, 1 ; partial shifted one more time for 2nd tap
+ packsswb xmm5, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
+ movdqa xmm2, [GLOBAL(t80)]
+
+%if %1 == 0
+ movdqa xmm1, [rsp+_p1] ; p1
+ lea rsi, [rsi + rcx*2]
+ lea rdi, [rdi + rcx*2]
+%elif %1 == 1
+ movdqa xmm1, [rsi+2*rax] ; p1
+%elif %1 == 2
+ movdqa xmm1, [rsp+_p1] ; p1
+%endif
+
+ pandn xmm4, xmm5 ; high edge variance additive
+ pxor xmm6, xmm2 ; unoffset
+
+ pxor xmm1, xmm2 ; reoffset
+ psubsb xmm3, xmm0 ; q0-= q0 add
+
+ paddsb xmm1, xmm4 ; p1+= p1 add
+ pxor xmm3, xmm2 ; unoffset
+
+ pxor xmm1, xmm2 ; unoffset
+ psubsb xmm7, xmm4 ; q1-= q1 add
+
+ pxor xmm7, xmm2 ; unoffset
+%if %1 == 0
+ movq [rsi], xmm6 ; p0
+ movhps [rdi], xmm6
+ movq [rsi + rax], xmm1 ; p1
+ movhps [rdi + rax], xmm1
+ movq [rsi + rcx], xmm3 ; q0
+ movhps [rdi + rcx], xmm3
+ movq [rsi + rcx*2], xmm7 ; q1
+ movhps [rdi + rcx*2], xmm7
+%elif %1 == 1
+ movdqa [rsi+rax], xmm6 ; write back
+ movdqa [rsi+2*rax], xmm1 ; write back
+ movdqa [rsi], xmm3 ; write back
+ movdqa [rdi], xmm7 ; write back
+%endif
+
+%endmacro
+
+SECTION .text
+
+; Only assembled when ABI_IS_32BIT is set.
+%if ABI_IS_32BIT
+
+;void vp8_loop_filter_horizontal_edge_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh
+;)
+; Filters a horizontal edge 16 pixels wide. src_ptr addresses the q0 row;
+; rows are read and written with movdqa (aligned 16-byte accesses).
+; B_FILTER 1 writes back the p1, p0, q0 and q1 rows.
+globalsym(vp8_loop_filter_horizontal_edge_sse2)
+sym(vp8_loop_filter_horizontal_edge_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, lf_var_size ; scratch slots _t0 .. _q3
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixel_step
+
+ mov rdx, arg(3) ;limit
+
+ lea rdi, [rsi+rax] ; rdi points to row +1 for indirect addressing
+
+ ; calculate breakout conditions and high edge variance
+ LFH_FILTER_AND_HEV_MASK 1
+ ; filter and write back the result
+ B_FILTER 1
+
+ add rsp, lf_var_size
+ pop rsp ; undo ALIGN_STACK
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+%endif
+
+;void vp8_loop_filter_horizontal_edge_uv_sse2
+;(
+; unsigned char *u,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh,
+; unsigned char *v
+;)
+; Filters a horizontal edge of the u and v planes together: each XMM row
+; holds 8 bytes of u in its low half (addressed through rsi) and 8 bytes of
+; v in its high half (addressed through rdi, loaded from arg(5)).
+; B_FILTER 0 writes back the p1, p0, q0 and q1 rows of both planes.
+globalsym(vp8_loop_filter_horizontal_edge_uv_sse2)
+sym(vp8_loop_filter_horizontal_edge_uv_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, lf_var_size ; scratch slots _t0 .. _q3
+
+ mov rsi, arg(0) ; u
+ mov rdi, arg(5) ; v
+ movsxd rax, dword ptr arg(1) ; src_pixel_step
+ mov rcx, rax ; rcx = +pitch
+ neg rax ; negate pitch to deal with above border
+
+ mov rdx, arg(3) ;limit
+
+ lea rsi, [rsi + rcx] ; rsi/rdi now address the q1 row
+ lea rdi, [rdi + rcx]
+
+ ; calculate breakout conditions and high edge variance
+ LFH_FILTER_AND_HEV_MASK 0
+ ; filter and write back the result
+ B_FILTER 0
+
+ add rsp, lf_var_size
+ pop rsp ; undo ALIGN_STACK
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+; MB_FILTER_AND_WRITEBACK %1
+; Applies the macroblock-edge filter to p2..q2 using the mask in xmm1 and
+; the hev mask in xmm4, then stores the results.
+; %1 == 0: p1/q1/p2/q2 come from the stack, p0/q0 are expected in
+; xmm6/xmm0; six rows are stored as 8-byte halves (low via rsi,
+; high via rdi). rdx is overwritten with 3*rcx.
+; %1 == 1: rows are read from and written to memory with aligned 16-byte
+; accesses; rcx is set to -rax.
+; %1 == 2: p1, p0, q0, q1 are loaded from and stored back to the stack;
+; p2 and q2 are read from the stack and left in xmm2 / xmm5.
+%macro MB_FILTER_AND_WRITEBACK 1
+ movdqa xmm3, [GLOBAL(t80)]
+%if %1 == 0
+ movdqa xmm2, [rsp+_p1] ; p1
+ movdqa xmm7, [rsp+_q1] ; q1
+%elif %1 == 1
+ movdqa xmm2, [rsi+2*rax] ; p1
+ movdqa xmm7, [rdi] ; q1
+
+ mov rcx, rax
+ neg rcx
+%elif %1 == 2
+ movdqa xmm2, [rsp+_p1] ; p1
+ movdqa xmm6, [rsp+_p0] ; p0
+ movdqa xmm0, [rsp+_q0] ; q0
+ movdqa xmm7, [rsp+_q1] ; q1
+%endif
+
+ pxor xmm2, xmm3 ; p1 offset to convert to signed values
+ pxor xmm7, xmm3 ; q1 offset to convert to signed values
+ pxor xmm6, xmm3 ; offset to convert to signed values
+ pxor xmm0, xmm3 ; offset to convert to signed values
+
+ psubsb xmm2, xmm7 ; p1 - q1
+
+ movdqa xmm3, xmm0 ; q0
+ psubsb xmm0, xmm6 ; q0 - p0
+ paddsb xmm2, xmm0 ; 1 * (q0 - p0) + (p1 - q1)
+ paddsb xmm2, xmm0 ; 2 * (q0 - p0) + (p1 - q1)
+ paddsb xmm2, xmm0 ; 3 * (q0 - p0) + (p1 - q1)
+ pand xmm1, xmm2 ; mask filter values we don't care about
+
+ movdqa xmm2, xmm1 ; vp8_filter
+
+ pand xmm2, xmm4 ; Filter2 = vp8_filter & hev
+ pxor xmm0, xmm0
+
+ pandn xmm4, xmm1 ; vp8_filter&=~hev
+ pxor xmm1, xmm1
+
+ ; NOTE: the (hi)/(lo) labels in the comments below are swapped relative to
+ ; byte position: punpcklbw puts the low 8 bytes in xmm0 and punpckhbw puts
+ ; the high 8 bytes in xmm1. Each byte lands in the high byte of its word.
+ punpcklbw xmm0, xmm4 ; Filter 2 (hi)
+ punpckhbw xmm1, xmm4 ; Filter 2 (lo)
+
+ movdqa xmm5, xmm2
+
+ movdqa xmm4, [GLOBAL(s9)]
+ paddsb xmm5, [GLOBAL(t3)] ; vp8_signed_char_clamp(Filter2 + 3)
+ paddsb xmm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4)
+
+ pmulhw xmm1, xmm4 ; Filter 2 (lo) * 9
+ pmulhw xmm0, xmm4 ; Filter 2 (hi) * 9
+
+ ; bytes are unpacked into the high byte of each word, so psraw by 11
+ ; (8 + 3) drops the junk low byte and performs the signed >> 3
+ punpckhbw xmm7, xmm5 ; axbxcxdx
+ punpcklbw xmm5, xmm5 ; exfxgxhx
+
+ psraw xmm7, 11 ; sign extended shift right by 3
+
+ psraw xmm5, 11 ; sign extended shift right by 3
+ punpckhbw xmm4, xmm2 ; axbxcxdx
+
+ punpcklbw xmm2, xmm2 ; exfxgxhx
+ psraw xmm4, 11 ; sign extended shift right by 3
+
+ packsswb xmm5, xmm7 ; Filter2 >>=3;
+ psraw xmm2, 11 ; sign extended shift right by 3
+
+ packsswb xmm2, xmm4 ; Filter1 >>=3;
+
+ paddsb xmm6, xmm5 ; ps0 =ps0 + Filter2
+
+ psubsb xmm3, xmm2 ; qs0 =qs0 - Filter1
+ movdqa xmm7, xmm1
+
+ movdqa xmm4, [GLOBAL(s63)]
+ movdqa xmm5, xmm0
+ movdqa xmm2, xmm5
+ paddw xmm0, xmm4 ; Filter 2 (hi) * 9 + 63
+ paddw xmm1, xmm4 ; Filter 2 (lo) * 9 + 63
+ movdqa xmm4, xmm7
+
+ paddw xmm5, xmm5 ; Filter 2 (hi) * 18
+
+ paddw xmm7, xmm7 ; Filter 2 (lo) * 18
+ paddw xmm5, xmm0 ; Filter 2 (hi) * 27 + 63
+
+ paddw xmm7, xmm1 ; Filter 2 (lo) * 27 + 63
+ paddw xmm2, xmm0 ; Filter 2 (hi) * 18 + 63
+ psraw xmm0, 7 ; (Filter 2 (hi) * 9 + 63) >> 7
+
+ paddw xmm4, xmm1 ; Filter 2 (lo) * 18 + 63
+ psraw xmm1, 7 ; (Filter 2 (lo) * 9 + 63) >> 7
+ psraw xmm2, 7 ; (Filter 2 (hi) * 18 + 63) >> 7
+
+ packsswb xmm0, xmm1 ; u1 = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
+
+ psraw xmm4, 7 ; (Filter 2 (lo) * 18 + 63) >> 7
+ psraw xmm5, 7 ; (Filter 2 (hi) * 27 + 63) >> 7
+ psraw xmm7, 7 ; (Filter 2 (lo) * 27 + 63) >> 7
+
+ packsswb xmm5, xmm7 ; u3 = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
+ packsswb xmm2, xmm4 ; u2 = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)
+ movdqa xmm7, [GLOBAL(t80)]
+
+%if %1 == 0
+ movdqa xmm1, [rsp+_q1] ; q1
+ movdqa xmm4, [rsp+_p1] ; p1
+ lea rsi, [rsi+rcx*2]
+ lea rdi, [rdi+rcx*2]
+
+%elif %1 == 1
+ movdqa xmm1, [rdi] ; q1
+ movdqa xmm4, [rsi+rax*2] ; p1
+%elif %1 == 2
+ movdqa xmm4, [rsp+_p1] ; p1
+ movdqa xmm1, [rsp+_q1] ; q1
+%endif
+
+ pxor xmm1, xmm7
+ pxor xmm4, xmm7
+
+ psubsb xmm3, xmm5 ; sq = vp8_signed_char_clamp(qs0 - u3)
+ paddsb xmm6, xmm5 ; sp = vp8_signed_char_clamp(ps0 + u3)
+ psubsb xmm1, xmm2 ; sq = vp8_signed_char_clamp(qs1 - u2)
+ paddsb xmm4, xmm2 ; sp = vp8_signed_char_clamp(ps1 + u2)
+
+%if %1 == 1
+ movdqa xmm2, [rdi+rax*4] ; p2
+ movdqa xmm5, [rdi+rcx] ; q2
+%else
+ movdqa xmm2, [rsp+_p2] ; p2
+ movdqa xmm5, [rsp+_q2] ; q2
+%endif
+
+ pxor xmm1, xmm7 ; *oq1 = sq^0x80;
+ pxor xmm4, xmm7 ; *op1 = sp^0x80;
+ pxor xmm2, xmm7
+ pxor xmm5, xmm7
+ paddsb xmm2, xmm0 ; sp = vp8_signed_char_clamp(ps2 + u)
+ psubsb xmm5, xmm0 ; sq = vp8_signed_char_clamp(qs2 - u)
+ pxor xmm2, xmm7 ; *op2 = sp^0x80;
+ pxor xmm5, xmm7 ; *oq2 = sq^0x80;
+ pxor xmm3, xmm7 ; *oq0 = sq^0x80
+ pxor xmm6, xmm7 ; *op0 = sp^0x80
+%if %1 == 0
+ movq [rsi], xmm6 ; p0
+ movhps [rdi], xmm6
+ movq [rsi + rcx], xmm3 ; q0
+ movhps [rdi + rcx], xmm3
+ lea rdx, [rcx + rcx*2] ; rdx = 3 * pitch, used to reach the q2 row
+ movq [rsi+rcx*2], xmm1 ; q1
+ movhps [rdi+rcx*2], xmm1
+
+ movq [rsi + rax], xmm4 ; p1
+ movhps [rdi + rax], xmm4
+
+ movq [rsi+rax*2], xmm2 ; p2
+ movhps [rdi+rax*2], xmm2
+
+ movq [rsi+rdx], xmm5 ; q2
+ movhps [rdi+rdx], xmm5
+%elif %1 == 1
+ movdqa [rdi+rcx], xmm5 ; q2
+ movdqa [rdi], xmm1 ; q1
+ movdqa [rsi], xmm3 ; q0
+ movdqa [rsi+rax ], xmm6 ; p0
+ movdqa [rsi+rax*2], xmm4 ; p1
+ movdqa [rdi+rax*4], xmm2 ; p2
+%elif %1 == 2
+ movdqa [rsp+_p1], xmm4 ; p1
+ movdqa [rsp+_p0], xmm6 ; p0
+ movdqa [rsp+_q0], xmm3 ; q0
+ movdqa [rsp+_q1], xmm1 ; q1
+%endif
+
+%endmacro
+
+
+;void vp8_mbloop_filter_horizontal_edge_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh
+;)
+; Filters a horizontal macroblock edge 16 pixels wide. src_ptr addresses the
+; q0 row; rows are accessed with movdqa (aligned 16-byte accesses).
+; MB_FILTER_AND_WRITEBACK 1 writes back the p2, p1, p0, q0, q1 and q2 rows.
+globalsym(vp8_mbloop_filter_horizontal_edge_sse2)
+sym(vp8_mbloop_filter_horizontal_edge_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, lf_var_size ; scratch slots _t0 .. _q3
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixel_step
+ mov rdx, arg(3) ;limit
+
+ lea rdi, [rsi+rax] ; rdi points to row +1 for indirect addressing
+
+ ; calculate breakout conditions and high edge variance
+ LFH_FILTER_AND_HEV_MASK 1
+ ; filter and write back the results
+ MB_FILTER_AND_WRITEBACK 1
+
+ add rsp, lf_var_size
+ pop rsp ; undo ALIGN_STACK
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_mbloop_filter_horizontal_edge_uv_sse2
+;(
+; unsigned char *u,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh,
+; unsigned char *v
+;)
+; Filters a horizontal macroblock edge of the u and v planes together: each
+; XMM row holds 8 bytes of u in its low half (via rsi) and 8 bytes of v in
+; its high half (via rdi). MB_FILTER_AND_WRITEBACK 0 writes back the
+; p2, p1, p0, q0, q1 and q2 rows of both planes.
+globalsym(vp8_mbloop_filter_horizontal_edge_uv_sse2)
+sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, lf_var_size ; scratch slots _t0 .. _q3
+
+ mov rsi, arg(0) ; u
+ mov rdi, arg(5) ; v
+ movsxd rax, dword ptr arg(1) ; src_pixel_step
+ mov rcx, rax ; rcx = +pitch
+ neg rax ; negate pitch to deal with above border
+ mov rdx, arg(3) ;limit
+
+ lea rsi, [rsi + rcx] ; rsi/rdi now address the q1 row
+ lea rdi, [rdi + rcx]
+
+ ; calculate breakout conditions and high edge variance
+ LFH_FILTER_AND_HEV_MASK 0
+ ; filter and write back the results
+ MB_FILTER_AND_WRITEBACK 0
+
+ add rsp, lf_var_size
+ pop rsp ; undo ALIGN_STACK
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+; TRANSPOSE_16X8 %1, %2
+; Loads 16 rows of 8 bytes and transposes them into eight 16-byte column
+; vectors (column 0 = p3 ... column 7 = q3).
+; In: rsi = row 0, rdi = row 1, rax = pitch; the [..+2*rcx] addressing
+; of rows 6/7 relies on rcx = 3*rax.
+; %1 != 0: rows 8-15 follow rows 0-7 (rsi/rdi advanced by 8*rax).
+; %1 == 0: rows 8-15 are read from arg(5) (v_ptr) - 4.
+; %2 == 0: columns 0, 1, 6, 7 are also saved to _p3, _p2, _q2, _q3.
+; Out: columns 2-5 saved to _p1, _p0, _q0, _q1; registers hold
+; xmm2 = col 0, xmm1 = col 1, xmm3 = col 3, xmm4 = col 4,
+; xmm5 = col 5, xmm6 = col 6, xmm7 = col 7. [rsp+_t0] is scratch.
+%macro TRANSPOSE_16X8 2
+ movq xmm4, [rsi] ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00
+ movq xmm1, [rdi] ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10
+ movq xmm0, [rsi+2*rax] ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20
+ movq xmm7, [rdi+2*rax] ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30
+ movq xmm5, [rsi+4*rax] ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40
+ movq xmm2, [rdi+4*rax] ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50
+
+ punpcklbw xmm4, xmm1 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
+
+ movq xmm1, [rdi+2*rcx] ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70
+
+ movdqa xmm3, xmm4 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
+ punpcklbw xmm0, xmm7 ; 37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20
+
+ movq xmm7, [rsi+2*rcx] ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60
+
+ punpcklbw xmm5, xmm2 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
+%if %1
+ lea rsi, [rsi+rax*8]
+ lea rdi, [rdi+rax*8]
+%else
+ mov rsi, arg(5) ; v_ptr
+%endif
+
+ movdqa xmm6, xmm5 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
+ punpcklbw xmm7, xmm1 ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60
+ punpcklwd xmm5, xmm7 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
+ punpckhwd xmm6, xmm7 ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44
+ punpcklwd xmm3, xmm0 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
+
+%if %1 == 0
+ lea rdi, [rsi + rax - 4] ; rdi points to row +1 for indirect addressing
+ lea rsi, [rsi - 4]
+%endif
+
+ movdqa xmm2, xmm3 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
+ punpckhwd xmm4, xmm0 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
+
+ movdqa xmm7, xmm4 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
+ punpckhdq xmm3, xmm5 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
+
+ punpckhdq xmm7, xmm6 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
+
+ punpckldq xmm4, xmm6 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
+
+ punpckldq xmm2, xmm5 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
+
+ movdqa [rsp+_t0], xmm2 ; save to free XMM2
+
+ ; second half: rows 8-f
+ movq xmm2, [rsi] ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80
+ movq xmm6, [rdi] ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90
+ movq xmm0, [rsi+2*rax] ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0
+ movq xmm5, [rdi+2*rax] ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0
+ movq xmm1, [rsi+4*rax] ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0
+
+ punpcklbw xmm2, xmm6 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
+
+ movq xmm6, [rdi+4*rax] ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0
+
+ punpcklbw xmm0, xmm5 ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0
+
+ movq xmm5, [rsi+2*rcx] ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0
+
+ punpcklbw xmm1, xmm6 ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 c1 d0 c0
+
+ movq xmm6, [rdi+2*rcx] ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0
+
+ punpcklbw xmm5, xmm6 ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0
+
+ movdqa xmm6, xmm1 ;
+ punpckhwd xmm6, xmm5 ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4
+
+ punpcklwd xmm1, xmm5 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
+ movdqa xmm5, xmm2 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
+
+ punpcklwd xmm5, xmm0 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
+
+ punpckhwd xmm2, xmm0 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
+
+ movdqa xmm0, xmm5
+ punpckldq xmm0, xmm1 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
+
+ punpckhdq xmm5, xmm1 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
+ movdqa xmm1, xmm2 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
+
+ punpckldq xmm1, xmm6 ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84
+
+ punpckhdq xmm2, xmm6 ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86
+ movdqa xmm6, xmm7 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
+
+ punpcklqdq xmm6, xmm2 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
+
+ punpckhqdq xmm7, xmm2 ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07
+
+%if %2 == 0
+ movdqa [rsp+_q3], xmm7 ; save 7
+ movdqa [rsp+_q2], xmm6 ; save 6
+%endif
+ movdqa xmm2, xmm3 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
+ punpckhqdq xmm3, xmm5 ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+ punpcklqdq xmm2, xmm5 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+ movdqa [rsp+_p1], xmm2 ; save 2
+
+ movdqa xmm5, xmm4 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
+ punpcklqdq xmm4, xmm1 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+ movdqa [rsp+_p0], xmm3 ; save 3
+
+ punpckhqdq xmm5, xmm1 ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
+
+ movdqa [rsp+_q0], xmm4 ; save 4
+ movdqa [rsp+_q1], xmm5 ; save 5
+ movdqa xmm1, [rsp+_t0]
+
+ movdqa xmm2, xmm1 ;
+ punpckhqdq xmm1, xmm0 ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
+ punpcklqdq xmm2, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+
+%if %2 == 0
+ movdqa [rsp+_p2], xmm1
+ movdqa [rsp+_p3], xmm2
+%endif
+
+%endmacro
+
+; LFV_FILTER_MASK_HEV_MASK
+; Builds the filter mask and the high-edge-variance (hev) mask from
+; transposed columns.
+; In: xmm2 = p3, xmm1 = p2, xmm3 = p0, xmm5 = q1, xmm6 = q2, xmm7 = q3;
+; p1, q0 and q1 are read from [rsp+_p1], [rsp+_q0], [rsp+_q1];
+; limit, blimit and thresh are fetched from arg(3), arg(2), arg(4).
+; Out: xmm1 = mask (0xff in lanes that pass the limit/blimit tests),
+; xmm4 = hev, xmm0 = 0. rdx is clobbered.
+%macro LFV_FILTER_MASK_HEV_MASK 0
+ movdqa xmm0, xmm6 ; q2
+ psubusb xmm0, xmm7 ; q2-q3
+
+ psubusb xmm7, xmm6 ; q3-q2
+ movdqa xmm4, xmm5 ; q1
+
+ por xmm7, xmm0 ; abs (q3-q2)
+ psubusb xmm4, xmm6 ; q1-q2
+
+ movdqa xmm0, xmm1 ; p2
+ psubusb xmm6, xmm5 ; q2-q1
+
+ por xmm6, xmm4 ; abs (q2-q1)
+ psubusb xmm0, xmm2 ; p2 - p3;
+
+ psubusb xmm2, xmm1 ; p3 - p2;
+ por xmm0, xmm2 ; abs(p2-p3)
+
+ movdqa xmm5, [rsp+_p1] ; p1
+ pmaxub xmm0, xmm7 ; xmm0 accumulates the running max of the differences
+
+ movdqa xmm2, xmm5 ; p1
+ psubusb xmm5, xmm1 ; p1-p2
+ psubusb xmm1, xmm2 ; p2-p1
+
+ movdqa xmm7, xmm3 ; p0
+ psubusb xmm7, xmm2 ; p0-p1
+
+ por xmm1, xmm5 ; abs(p2-p1)
+ pmaxub xmm0, xmm6
+
+ pmaxub xmm0, xmm1
+ movdqa xmm1, xmm2 ; p1
+
+ psubusb xmm2, xmm3 ; p1-p0
+
+ por xmm2, xmm7 ; abs(p1-p0)
+
+ pmaxub xmm0, xmm2
+
+ movdqa xmm5, [rsp+_q0] ; q0
+ movdqa xmm7, [rsp+_q1] ; q1
+
+ mov rdx, arg(3) ; limit
+
+ movdqa xmm6, xmm5 ; q0
+ movdqa xmm4, xmm7 ; q1
+
+ psubusb xmm5, xmm7 ; q0-q1
+ psubusb xmm7, xmm6 ; q1-q0
+
+ por xmm7, xmm5 ; abs(q1-q0)
+
+ pmaxub xmm0, xmm7
+
+ psubusb xmm0, [rdx] ; limit
+
+ mov rdx, arg(2) ; blimit
+ movdqa xmm5, xmm4 ; q1
+
+ psubusb xmm5, xmm1 ; q1-=p1
+ psubusb xmm1, xmm4 ; p1-=q1
+
+ por xmm5, xmm1 ; abs(p1-q1)
+ movdqa xmm1, xmm3 ; p0
+
+ pand xmm5, [GLOBAL(tfe)] ; set lsb of each byte to zero
+ psubusb xmm1, xmm6 ; p0-q0
+
+ movdqa xmm4, [rdx] ; blimit
+ mov rdx, arg(4) ; get thresh
+
+ psrlw xmm5, 1 ; abs(p1-q1)/2
+ psubusb xmm6, xmm3 ; q0-p0
+
+ por xmm1, xmm6 ; abs(q0-p0)
+ paddusb xmm1, xmm1 ; abs(q0-p0)*2
+ movdqa xmm3, [rdx] ; thresh
+
+ paddusb xmm1, xmm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+ psubusb xmm2, xmm3 ; abs(p1 - p0) > thresh
+
+ psubusb xmm7, xmm3 ; abs(q1 - q0) > thresh
+
+ psubusb xmm1, xmm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
+ por xmm2, xmm7 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+
+ por xmm1, xmm0 ; mask
+ ; NOTE(review): compares against xmm0 (the limit residue) before it is
+ ; zeroed; presumably harmless because lanes with xmm0 != 0 are rejected by
+ ; the mask in xmm1 anyway - confirm.
+ pcmpeqb xmm2, xmm0
+
+ pxor xmm0, xmm0
+ pcmpeqb xmm4, xmm4 ; all ones
+
+ pcmpeqb xmm1, xmm0 ; mask: 0xff where no limit was exceeded
+ pxor xmm4, xmm2 ; hev = ~xmm2
+%endmacro
+
+; BV_TRANSPOSE
+; Transposes the four filtered columns 2-5 (xmm1, xmm6, xmm3, xmm7) back into
+; row order: each output register holds four rows of 4 bytes (columns 2-5).
+; Out: xmm2 = rows 0-3, xmm6 = rows 4-7, xmm1 = rows 8-b, xmm5 = rows c-f.
+; Clobbers xmm3 and xmm4.
+%macro BV_TRANSPOSE 0
+ ; xmm1 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+ ; xmm6 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+ ; xmm3 = f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+ ; xmm7 = f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
+ movdqa xmm2, xmm1 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+ punpcklbw xmm2, xmm6 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
+
+ movdqa xmm4, xmm3 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+ punpckhbw xmm1, xmm6 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
+
+ punpcklbw xmm4, xmm7 ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
+
+ punpckhbw xmm3, xmm7 ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84
+
+ movdqa xmm6, xmm2 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
+ punpcklwd xmm2, xmm4 ; 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
+
+ punpckhwd xmm6, xmm4 ; 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
+ movdqa xmm5, xmm1 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
+
+ punpcklwd xmm1, xmm3 ; b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
+
+ punpckhwd xmm5, xmm3 ; f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
+ ; xmm2 = 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
+ ; xmm6 = 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
+ ; xmm1 = b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
+ ; xmm5 = f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
+%endmacro
+
+; BV_WRITEBACK %1, %2
+; Stores eight rows of 4 bytes at byte offset +2 from the row start.
+; %1 supplies rows 0-3 and %2 supplies rows 4-7, one dword per row, lowest
+; dword first. rsi = row 0, rdi = row 1, rax = pitch; the [..+2*rcx]
+; addressing of rows 6/7 relies on rcx = 3*rax.
+; Both register arguments are shifted right as they are consumed.
+%macro BV_WRITEBACK 2
+ movd [rsi+2], %1 ; row 0
+ movd [rsi+4*rax+2], %2 ; row 4
+ psrldq %1, 4
+ psrldq %2, 4
+ movd [rdi+2], %1 ; row 1
+ movd [rdi+4*rax+2], %2 ; row 5
+ psrldq %1, 4
+ psrldq %2, 4
+ movd [rsi+2*rax+2], %1 ; row 2
+ movd [rsi+2*rcx+2], %2 ; row 6
+ psrldq %1, 4
+ psrldq %2, 4
+ movd [rdi+2*rax+2], %1 ; row 3
+ movd [rdi+2*rcx+2], %2 ; row 7
+%endmacro
+
+; Only assembled when ABI_IS_32BIT is set.
+%if ABI_IS_32BIT
+
+;void vp8_loop_filter_vertical_edge_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh
+;)
+; Filters a vertical edge 16 rows tall. Reads 8 bytes per row starting at
+; src_ptr - 4, transposes them, filters, and writes back 4 bytes per row
+; (the p1, p0, q0, q1 columns).
+globalsym(vp8_loop_filter_vertical_edge_sse2)
+sym(vp8_loop_filter_vertical_edge_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, lf_var_size ; scratch slots _t0 .. _q3
+
+ mov rsi, arg(0) ; src_ptr
+ movsxd rax, dword ptr arg(1) ; src_pixel_step
+
+ lea rsi, [rsi - 4]
+ lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
+ lea rcx, [rax*2+rax] ; rcx = 3 * pitch
+
+ ;transpose 16x8 to 8x16, and store the 8-line result on stack.
+ TRANSPOSE_16X8 1, 1
+
+ ; calculate filter mask and high edge variance
+ LFV_FILTER_MASK_HEV_MASK
+
+ ; start work on filters
+ B_FILTER 2
+
+ ; transpose and write back - only work on q1, q0, p0, p1
+ BV_TRANSPOSE
+ ; store 16-line result
+
+ lea rdx, [rax]
+ neg rdx ; rdx = -pitch
+
+ ; rsi/rdi were advanced to rows 8/9 by TRANSPOSE_16X8: write rows 8-f first
+ BV_WRITEBACK xmm1, xmm5
+
+ lea rsi, [rsi+rdx*8] ; step back to rows 0/1
+ lea rdi, [rdi+rdx*8]
+ BV_WRITEBACK xmm2, xmm6
+
+ add rsp, lf_var_size
+ pop rsp ; undo ALIGN_STACK
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+%endif
+
+;void vp8_loop_filter_vertical_edge_uv_sse2
+;(
+; unsigned char *u,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh,
+; unsigned char *v
+;)
+; Filters a vertical edge of the u and v planes together: rows 0-7 of the
+; transposed block come from u - 4 and rows 8-f from v - 4. Writes back
+; 4 bytes per row (the p1, p0, q0, q1 columns) to both planes.
+globalsym(vp8_loop_filter_vertical_edge_uv_sse2)
+sym(vp8_loop_filter_vertical_edge_uv_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, lf_var_size ; scratch slots _t0 .. _q3
+
+ mov rsi, arg(0) ; u_ptr
+ movsxd rax, dword ptr arg(1) ; src_pixel_step
+
+ lea rsi, [rsi - 4]
+ lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
+ lea rcx, [rax+2*rax] ; rcx = 3 * pitch
+
+ ;transpose 16x8 to 8x16, and store the 8-line result on stack.
+ TRANSPOSE_16X8 0, 1
+
+ ; calculate filter mask and high edge variance
+ LFV_FILTER_MASK_HEV_MASK
+
+ ; start work on filters
+ B_FILTER 2
+
+ ; transpose and write back - only work on q1, q0, p0, p1
+ BV_TRANSPOSE
+
+ ; rsi was left at v_ptr - 4 by TRANSPOSE_16X8 0, so rows 8-f go to v
+ lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
+
+ ; store 16-line result
+ BV_WRITEBACK xmm1, xmm5
+
+ ; rows 0-7 go back to u
+ mov rsi, arg(0) ; u_ptr
+ lea rsi, [rsi - 4]
+ lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
+ BV_WRITEBACK xmm2, xmm6
+
+ add rsp, lf_var_size
+ pop rsp ; undo ALIGN_STACK
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+; MBV_TRANSPOSE
+; First stage of transposing the eight filtered columns back into rows.
+; In: xmm2 = column 1 (p2), xmm5 = column 6 (q2); columns 0, 2, 3, 4, 5, 7
+; are read from _p3, _p1, _p0, _q0, _q1, _q3 on the stack.
+; Out: xmm0 = rows 0-1 and xmm6 = rows 2-3 (8 bytes per row);
+; xmm3 / xmm2 = columns 0-3 / 4-7 of rows 4-7;
+; xmm1 / xmm4 = columns 0-3 of rows 8-b / c-f;
+; xmm5 is left unchanged. Clobbers xmm7.
+%macro MBV_TRANSPOSE 0
+ movdqa xmm0, [rsp+_p3] ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+ movdqa xmm1, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+
+ punpcklbw xmm0, xmm2 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
+ punpckhbw xmm1, xmm2 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
+
+ movdqa xmm7, [rsp+_p1] ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+ movdqa xmm6, xmm7 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+
+ punpcklbw xmm7, [rsp+_p0] ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
+ punpckhbw xmm6, [rsp+_p0] ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
+
+ movdqa xmm3, xmm0 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
+ punpcklwd xmm0, xmm7 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
+
+ punpckhwd xmm3, xmm7 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
+ movdqa xmm4, xmm1 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
+
+ punpcklwd xmm1, xmm6 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
+ punpckhwd xmm4, xmm6 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
+
+ movdqa xmm7, [rsp+_q0] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+ punpcklbw xmm7, [rsp+_q1] ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
+
+ movdqa xmm6, xmm5 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
+ punpcklbw xmm6, [rsp+_q3] ; 77 76 67 66 57 56 47 46 37 36 27 26 17 16 07 06
+
+ movdqa xmm2, xmm7 ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
+ punpcklwd xmm7, xmm6 ; 37 36 35 34 27 26 25 24 17 16 15 14 07 06 05 04
+
+ punpckhwd xmm2, xmm6 ; 77 76 75 74 67 66 65 64 57 56 55 54 47 46 45 44
+ movdqa xmm6, xmm0 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
+
+ punpckldq xmm0, xmm7 ; 17 16 15 14 13 12 11 10 07 06 05 04 03 02 01 00
+ punpckhdq xmm6, xmm7 ; 37 36 35 34 33 32 31 30 27 26 25 24 23 22 21 20
+%endmacro
+
+; MBV_WRITEBACK_1
+; Stores rows 0-7 (8 bytes each) from the MBV_TRANSPOSE outputs, then
+; prepares rows 8-f for MBV_WRITEBACK_2.
+; In: rsi = row 0, rdi = row 1, rax = pitch; the [..+2*rcx] addressing of
+; rows 6/7 relies on rcx = 3*rax. xmm0/xmm6/xmm3/xmm2/xmm1/xmm5 as left
+; by MBV_TRANSPOSE.
+; Out: xmm1 = rows 8-9, xmm5 = rows a-b, xmm7 = columns 4-7 of rows c-f.
+%macro MBV_WRITEBACK_1 0
+ movq [rsi], xmm0 ; row 0
+ movhps [rdi], xmm0 ; row 1
+
+ movq [rsi+2*rax], xmm6 ; row 2
+ movhps [rdi+2*rax], xmm6 ; row 3
+
+ movdqa xmm0, xmm3 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
+ punpckldq xmm0, xmm2 ; 57 56 55 54 53 52 51 50 47 46 45 44 43 42 41 40
+ punpckhdq xmm3, xmm2 ; 77 76 75 74 73 72 71 70 67 66 65 64 63 62 61 60
+
+ movq [rsi+4*rax], xmm0 ; row 4
+ movhps [rdi+4*rax], xmm0 ; row 5
+
+ movq [rsi+2*rcx], xmm3 ; row 6
+ movhps [rdi+2*rcx], xmm3 ; row 7
+
+ movdqa xmm7, [rsp+_q0] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+ punpckhbw xmm7, [rsp+_q1] ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84
+ punpckhbw xmm5, [rsp+_q3] ; f7 f6 e7 e6 d7 d6 c7 c6 b7 b6 a7 a6 97 96 87 86
+
+ movdqa xmm0, xmm7
+ punpcklwd xmm0, xmm5 ; b7 b6 b5 b4 a7 a6 a5 a4 97 96 95 94 87 86 85 84
+ punpckhwd xmm7, xmm5 ; f7 f6 f5 f4 e7 e6 e5 e4 d7 d6 d5 d4 c7 c6 c5 c4
+
+ movdqa xmm5, xmm1 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
+ punpckldq xmm1, xmm0 ; 97 96 95 94 93 92 91 90 87 86 85 84 83 82 81 80
+ punpckhdq xmm5, xmm0 ; b7 b6 b5 b4 b3 b2 b1 b0 a7 a6 a5 a4 a3 a2 a1 a0
+%endmacro
+
+; MBV_WRITEBACK_2
+; Stores rows 8-f (8 bytes each).
+; In: rsi = row 8, rdi = row 9, rax = pitch; the [..+2*rcx] addressing of
+; rows e/f relies on rcx = 3*rax. xmm1 = rows 8-9, xmm5 = rows a-b,
+; xmm4 / xmm7 = columns 0-3 / 4-7 of rows c-f.
+%macro MBV_WRITEBACK_2 0
+ movq [rsi], xmm1 ; row 8
+ movhps [rdi], xmm1 ; row 9
+
+ movq [rsi+2*rax], xmm5 ; row a
+ movhps [rdi+2*rax], xmm5 ; row b
+
+ movdqa xmm1, xmm4 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
+ punpckldq xmm1, xmm7 ; d7 d6 d5 d4 d3 d2 d1 d0 c7 c6 c5 c4 c3 c2 c1 c0
+ punpckhdq xmm4, xmm7 ; f7 f6 f5 f4 f3 f2 f1 f0 e7 e6 e5 e4 e3 e2 e1 e0
+
+ movq [rsi+4*rax], xmm1 ; row c
+ movhps [rdi+4*rax], xmm1 ; row d
+
+ movq [rsi+2*rcx], xmm4 ; row e
+ movhps [rdi+2*rcx], xmm4 ; row f
+%endmacro
+
+
+;void vp8_mbloop_filter_vertical_edge_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh
+;)
+; Filters a vertical macroblock edge 16 rows tall. Reads 8 bytes per row
+; starting at src_ptr - 4, transposes them, filters, and writes all 8 bytes
+; of every row back.
+globalsym(vp8_mbloop_filter_vertical_edge_sse2)
+sym(vp8_mbloop_filter_vertical_edge_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, lf_var_size ; scratch slots _t0 .. _q3
+
+ mov rsi, arg(0) ; src_ptr
+ movsxd rax, dword ptr arg(1) ; src_pixel_step
+
+ lea rsi, [rsi - 4]
+ lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
+ lea rcx, [rax*2+rax] ; rcx = 3 * pitch
+
+ ; Transpose
+ TRANSPOSE_16X8 1, 0
+
+ ; calculate filter mask and high edge variance
+ LFV_FILTER_MASK_HEV_MASK
+
+ neg rax ; rax = -pitch, used below to step back 8 rows
+ ; start work on filters
+ MB_FILTER_AND_WRITEBACK 2
+
+ ; TRANSPOSE_16X8 left rsi/rdi at rows 8/9; return to rows 0/1
+ lea rsi, [rsi+rax*8]
+ lea rdi, [rdi+rax*8]
+
+ ; transpose and write back
+ MBV_TRANSPOSE
+
+ neg rax ; rax = +pitch again
+
+ MBV_WRITEBACK_1
+
+
+ lea rsi, [rsi+rax*8] ; advance to rows 8/9
+ lea rdi, [rdi+rax*8]
+ MBV_WRITEBACK_2
+
+ add rsp, lf_var_size
+ pop rsp ; undo ALIGN_STACK
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_mbloop_filter_vertical_edge_uv_sse2
+;(
+; unsigned char *u,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh,
+; unsigned char *v
+;)
+; Filters a vertical macroblock edge of the u and v planes together: rows
+; 0-7 of the transposed block come from u - 4 and rows 8-f from v - 4.
+; All 8 bytes of every row are written back to both planes.
+globalsym(vp8_mbloop_filter_vertical_edge_uv_sse2)
+sym(vp8_mbloop_filter_vertical_edge_uv_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, lf_var_size ; scratch slots _t0 .. _q3
+
+ mov rsi, arg(0) ; u_ptr
+ movsxd rax, dword ptr arg(1) ; src_pixel_step
+
+ lea rsi, [rsi - 4]
+ lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
+ lea rcx, [rax+2*rax] ; rcx = 3 * pitch
+
+ ; Transpose
+ TRANSPOSE_16X8 0, 0
+
+ ; calculate filter mask and high edge variance
+ LFV_FILTER_MASK_HEV_MASK
+
+ ; start work on filters
+ MB_FILTER_AND_WRITEBACK 2
+
+ ; transpose and write back
+ MBV_TRANSPOSE
+
+ ; rows 0-7 go back to u
+ mov rsi, arg(0) ;u_ptr
+ lea rsi, [rsi - 4]
+ lea rdi, [rsi + rax]
+ MBV_WRITEBACK_1
+ ; rows 8-f go back to v
+ mov rsi, arg(5) ;v_ptr
+ lea rsi, [rsi - 4]
+ lea rdi, [rsi + rax]
+ MBV_WRITEBACK_2
+
+ add rsp, lf_var_size
+ pop rsp ; undo ALIGN_STACK
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_loop_filter_simple_horizontal_edge_sse2
+;( simple loop filter over 16 bytes of a horizontal edge; only the p0 and q0 rows are rewritten
+; unsigned char *src_ptr, (q0 row; p0 and p1 are one and two rows before it, q1 one row after)
+; int src_pixel_step, (byte offset between rows)
+; const char *blimit, (16 bytes of edge limit; read with movdqa, so 16-byte aligned)
+;)
+globalsym(vp8_loop_filter_simple_horizontal_edge_sse2)
+sym(vp8_loop_filter_simple_horizontal_edge_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 3
+ SAVE_XMM 7
+ GET_GOT rbx
+ ; end prolog
+
+ mov rcx, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixel_step ; byte offset between rows
+ movdqa xmm6, [GLOBAL(tfe)] ; 0xfe per byte, used to clear LSBs before the shift by 1
+ lea rdx, [rcx + rax] ; rdx = src_ptr + step (the q1 row)
+ neg rax ; rax = -step, so [rcx+rax] and [rcx+2*rax] reach p0 and p1
+
+ ; calculate mask
+ movdqa xmm0, [rdx] ; q1
+ mov rdx, arg(2) ;blimit
+ movdqa xmm1, [rcx+2*rax] ; p1
+
+ movdqa xmm2, xmm1 ; keep p1 for the filter stage
+ movdqa xmm3, xmm0 ; keep q1 for the filter stage
+
+ psubusb xmm0, xmm1 ; q1-=p1
+ psubusb xmm1, xmm3 ; p1-=q1
+ por xmm1, xmm0 ; abs(p1-q1)
+ pand xmm1, xmm6 ; set lsb of each byte to zero
+ psrlw xmm1, 1 ; abs(p1-q1)/2
+
+ movdqa xmm7, XMMWORD PTR [rdx] ; blimit
+
+ movdqa xmm5, [rcx+rax] ; p0
+ movdqa xmm4, [rcx] ; q0
+ movdqa xmm0, xmm4 ; q0
+ movdqa xmm6, xmm5 ; p0
+ psubusb xmm5, xmm4 ; p0-=q0
+ psubusb xmm4, xmm6 ; q0-=p0
+ por xmm5, xmm4 ; abs(p0 - q0)
+
+ movdqa xmm4, [GLOBAL(t80)] ; 0x80 per byte: sign-bit offset used below
+
+ paddusb xmm5, xmm5 ; abs(p0-q0)*2
+ paddusb xmm5, xmm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+ psubusb xmm5, xmm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
+ pxor xmm7, xmm7
+ pcmpeqb xmm5, xmm7 ; xmm5 = 0xff where the sum did not exceed blimit
+
+
+ ; start work on filters
+ pxor xmm2, xmm4 ; p1 offset to convert to signed values
+ pxor xmm3, xmm4 ; q1 offset to convert to signed values
+ psubsb xmm2, xmm3 ; p1 - q1
+
+ pxor xmm6, xmm4 ; offset to convert to signed values
+ pxor xmm0, xmm4 ; offset to convert to signed values
+ movdqa xmm3, xmm0 ; q0
+ psubsb xmm0, xmm6 ; q0 - p0
+ paddsb xmm2, xmm0 ; p1 - q1 + 1 * (q0 - p0)
+ paddsb xmm2, xmm0 ; p1 - q1 + 2 * (q0 - p0)
+ paddsb xmm2, xmm0 ; p1 - q1 + 3 * (q0 - p0)
+ pand xmm5, xmm2 ; mask filter values we don't care about
+
+ movdqa xmm0, xmm5
+ paddsb xmm5, [GLOBAL(t3)] ; filter + 3 (becomes the p0 adjustment below)
+ paddsb xmm0, [GLOBAL(t4)] ; filter + 4 (becomes the q0 adjustment below)
+
+ movdqa xmm1, [GLOBAL(te0)]
+ movdqa xmm2, [GLOBAL(t1f)]
+
+; pxor xmm7, xmm7 ; (left disabled: xmm7 is still zero from the pxor before pcmpeqb)
+ pcmpgtb xmm7, xmm0 ;save sign
+ pand xmm7, xmm1 ;preserve the upper 3 bits
+ psrlw xmm0, 3 ; word-wide logical shift; per-byte result is fixed up by the two masks
+ pand xmm0, xmm2 ;clear out upper 3 bits
+ por xmm0, xmm7 ;add sign
+ psubsb xmm3, xmm0 ; q0 -= (filter + 4) >> 3
+
+ pxor xmm7, xmm7
+ pcmpgtb xmm7, xmm5 ;save sign
+ pand xmm7, xmm1 ;preserve the upper 3 bits
+ psrlw xmm5, 3 ; same per-byte arithmetic-shift emulation as above
+ pand xmm5, xmm2 ;clear out upper 3 bits
+ por xmm5, xmm7 ;add sign
+ paddsb xmm6, xmm5 ; p0 += (filter + 3) >> 3
+
+ pxor xmm3, xmm4 ; unoffset
+ movdqa [rcx], xmm3 ; write back q0
+
+ pxor xmm6, xmm4 ; unoffset
+ movdqa [rcx+rax], xmm6 ; write back p0
+
+ ; begin epilog
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_loop_filter_simple_vertical_edge_sse2
+;( simple loop filter on a vertical edge, 16 rows: 4 bytes per row are transposed, filtered, written back
+; unsigned char *src_ptr, (the 4 bytes read per row start at src_ptr - 2)
+; int src_pixel_step, (byte offset between rows)
+; const char *blimit, (16 bytes of edge limit; read with movdqa, so 16-byte aligned)
+;)
+globalsym(vp8_loop_filter_simple_vertical_edge_sse2)
+sym(vp8_loop_filter_simple_vertical_edge_sse2):
+ push rbp ; save old base pointer value.
+ mov rbp, rsp ; set new base pointer value.
+ SHADOW_ARGS_TO_STACK 3
+ SAVE_XMM 7
+ GET_GOT rbx ; save callee-saved reg
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 32 ; reserve 32 bytes
+ %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16];
+ %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16];
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixel_step ; byte offset between rows
+
+ lea rsi, [rsi - 2 ] ; row 0, starting 2 bytes before src_ptr
+ lea rdi, [rsi + rax] ; row 1
+ lea rdx, [rsi + rax*4] ; row 4
+ lea rcx, [rdx + rax] ; row 5
+
+ movd xmm0, [rsi] ; (high 96 bits unused) 03 02 01 00
+ movd xmm1, [rdx] ; (high 96 bits unused) 43 42 41 40
+ movd xmm2, [rdi] ; 13 12 11 10
+ movd xmm3, [rcx] ; 53 52 51 50
+ punpckldq xmm0, xmm1 ; (high 64 bits unused) 43 42 41 40 03 02 01 00
+ punpckldq xmm2, xmm3 ; 53 52 51 50 13 12 11 10
+
+ movd xmm4, [rsi + rax*2] ; 23 22 21 20
+ movd xmm5, [rdx + rax*2] ; 63 62 61 60
+ movd xmm6, [rdi + rax*2] ; 33 32 31 30
+ movd xmm7, [rcx + rax*2] ; 73 72 71 70
+ punpckldq xmm4, xmm5 ; 63 62 61 60 23 22 21 20
+ punpckldq xmm6, xmm7 ; 73 72 71 70 33 32 31 30
+
+ punpcklbw xmm0, xmm2 ; 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
+ punpcklbw xmm4, xmm6 ; 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20
+
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm4 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
+ punpckhwd xmm1, xmm4 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
+
+ movdqa xmm2, xmm0
+ punpckldq xmm0, xmm1 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
+ punpckhdq xmm2, xmm1 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
+
+ lea rsi, [rsi + rax*8] ; row 8
+ lea rdi, [rsi + rax] ; row 9
+ lea rdx, [rsi + rax*4] ; row c
+ lea rcx, [rdx + rax] ; row d
+
+ movd xmm4, [rsi] ; 83 82 81 80
+ movd xmm1, [rdx] ; c3 c2 c1 c0
+ movd xmm6, [rdi] ; 93 92 91 90
+ movd xmm3, [rcx] ; d3 d2 d1 d0
+ punpckldq xmm4, xmm1 ; c3 c2 c1 c0 83 82 81 80
+ punpckldq xmm6, xmm3 ; d3 d2 d1 d0 93 92 91 90
+
+ movd xmm1, [rsi + rax*2] ; a3 a2 a1 a0
+ movd xmm5, [rdx + rax*2] ; e3 e2 e1 e0
+ movd xmm3, [rdi + rax*2] ; b3 b2 b1 b0
+ movd xmm7, [rcx + rax*2] ; f3 f2 f1 f0
+ punpckldq xmm1, xmm5 ; e3 e2 e1 e0 a3 a2 a1 a0
+ punpckldq xmm3, xmm7 ; f3 f2 f1 f0 b3 b2 b1 b0
+
+ punpcklbw xmm4, xmm6 ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80
+ punpcklbw xmm1, xmm3 ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0
+
+ movdqa xmm7, xmm4
+ punpcklwd xmm4, xmm1 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
+ punpckhwd xmm7, xmm1 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
+
+ movdqa xmm6, xmm4
+ punpckldq xmm4, xmm7 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
+ punpckhdq xmm6, xmm7 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
+
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+
+ punpcklqdq xmm0, xmm4 ; p1 f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+ punpckhqdq xmm1, xmm4 ; p0 f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
+ punpcklqdq xmm2, xmm6 ; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+ punpckhqdq xmm3, xmm6 ; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+
+ mov rdx, arg(2) ;blimit (rdx no longer holds a row pointer from here on)
+
+ ; calculate mask
+ movdqa xmm6, xmm0 ; p1
+ movdqa xmm7, xmm3 ; q1
+ psubusb xmm7, xmm0 ; q1-=p1
+ psubusb xmm6, xmm3 ; p1-=q1
+ por xmm6, xmm7 ; abs(p1-q1)
+ pand xmm6, [GLOBAL(tfe)] ; set lsb of each byte to zero
+ psrlw xmm6, 1 ; abs(p1-q1)/2
+
+ movdqa xmm7, [rdx] ; blimit
+
+ movdqa xmm5, xmm1 ; p0
+ movdqa xmm4, xmm2 ; q0
+ psubusb xmm5, xmm2 ; p0-=q0
+ psubusb xmm4, xmm1 ; q0-=p0
+ por xmm5, xmm4 ; abs(p0 - q0)
+ paddusb xmm5, xmm5 ; abs(p0-q0)*2
+ paddusb xmm5, xmm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+ movdqa xmm4, [GLOBAL(t80)] ; 0x80 per byte: sign-bit offset used below
+
+ psubusb xmm5, xmm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
+ pxor xmm7, xmm7
+ pcmpeqb xmm5, xmm7 ; xmm5 = mask (0xff where the sum did not exceed blimit)
+
+ ; start work on filters
+ movdqa t0, xmm0 ; stash unfiltered p1 for the write-back transpose
+ movdqa t1, xmm3 ; stash unfiltered q1 for the write-back transpose
+
+ pxor xmm0, xmm4 ; p1 offset to convert to signed values
+ pxor xmm3, xmm4 ; q1 offset to convert to signed values
+ psubsb xmm0, xmm3 ; p1 - q1
+
+ pxor xmm1, xmm4 ; offset to convert to signed values
+ pxor xmm2, xmm4 ; offset to convert to signed values
+
+ movdqa xmm3, xmm2 ; offseted ; q0
+ psubsb xmm2, xmm1 ; q0 - p0
+ paddsb xmm0, xmm2 ; p1 - q1 + 1 * (q0 - p0)
+ paddsb xmm0, xmm2 ; p1 - q1 + 2 * (q0 - p0)
+ paddsb xmm0, xmm2 ; p1 - q1 + 3 * (q0 - p0)
+ pand xmm5, xmm0 ; mask filter values we don't care about
+
+ movdqa xmm0, xmm5
+ paddsb xmm5, [GLOBAL(t3)] ; filter + 3 (becomes the p0 adjustment below)
+ paddsb xmm0, [GLOBAL(t4)] ; filter + 4 (becomes the q0 adjustment below)
+
+ movdqa xmm6, [GLOBAL(te0)]
+ movdqa xmm2, [GLOBAL(t1f)]
+
+; pxor xmm7, xmm7 ; (left disabled: xmm7 is still zero from the pxor before pcmpeqb)
+ pcmpgtb xmm7, xmm0 ;save sign
+ pand xmm7, xmm6 ;preserve the upper 3 bits
+ psrlw xmm0, 3 ; word-wide logical shift; per-byte result is fixed up by the two masks
+ pand xmm0, xmm2 ;clear out upper 3 bits
+ por xmm0, xmm7 ;add sign
+ psubsb xmm3, xmm0 ; q0 -= (filter + 4) >> 3
+
+ pxor xmm7, xmm7
+ pcmpgtb xmm7, xmm5 ;save sign
+ pand xmm7, xmm6 ;preserve the upper 3 bits
+ psrlw xmm5, 3 ; same per-byte arithmetic-shift emulation as above
+ pand xmm5, xmm2 ;clear out upper 3 bits
+ por xmm5, xmm7 ;add sign
+ paddsb xmm1, xmm5 ; p0 += (filter + 3) >> 3
+
+ pxor xmm3, xmm4 ; unoffset q0
+ pxor xmm1, xmm4 ; unoffset p0
+
+ movdqa xmm0, t0 ; p1
+ movdqa xmm4, t1 ; q1
+
+ ; registers going into the transpose: xmm0 = p1, xmm1 = p0, xmm3 = q0, xmm4 = q1
+ lea rdx, [rsi + rax*4] ; row c again (rdx was reused for blimit; rsi still points at row 8)
+
+ ; transpose back to write out
+ ; p1 f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+ ; p0 f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
+ ; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+ ; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+ movdqa xmm6, xmm0
+ punpcklbw xmm0, xmm1 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
+ punpckhbw xmm6, xmm1 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
+
+ movdqa xmm5, xmm3
+ punpcklbw xmm3, xmm4 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
+ punpckhbw xmm5, xmm4 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
+
+ movdqa xmm2, xmm0
+ punpcklwd xmm0, xmm3 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
+ punpckhwd xmm2, xmm3 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
+
+ movdqa xmm3, xmm6
+ punpcklwd xmm6, xmm5 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
+ punpckhwd xmm3, xmm5 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
+
+ movd [rsi], xmm6 ; write the second 8-line result
+ movd [rdx], xmm3
+ psrldq xmm6, 4 ; bring the next row's 4 bytes into the low dword
+ psrldq xmm3, 4
+ movd [rdi], xmm6
+ movd [rcx], xmm3
+ psrldq xmm6, 4
+ psrldq xmm3, 4
+ movd [rsi + rax*2], xmm6
+ movd [rdx + rax*2], xmm3
+ psrldq xmm6, 4
+ psrldq xmm3, 4
+ movd [rdi + rax*2], xmm6
+ movd [rcx + rax*2], xmm3
+
+ neg rax
+ lea rsi, [rsi + rax*8] ; step back 8 rows to row 0
+ neg rax ; restore the positive step
+ lea rdi, [rsi + rax] ; row 1
+ lea rdx, [rsi + rax*4] ; row 4
+ lea rcx, [rdx + rax] ; row 5
+
+ movd [rsi], xmm0 ; write the first 8-line result
+ movd [rdx], xmm2
+ psrldq xmm0, 4
+ psrldq xmm2, 4
+ movd [rdi], xmm0
+ movd [rcx], xmm2
+ psrldq xmm0, 4
+ psrldq xmm2, 4
+ movd [rsi + rax*2], xmm0
+ movd [rdx + rax*2], xmm2
+ psrldq xmm0, 4
+ psrldq xmm2, 4
+ movd [rdi + rax*2], xmm0
+ movd [rcx + rax*2], xmm2
+
+ add rsp, 32 ; release t0/t1
+ pop rsp ; NOTE(review): presumably restores the rsp value saved by ALIGN_STACK -- confirm in x86_abi_support.asm
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+tfe: ; 0xfe per byte: clears bit 0 so a word shift right by 1 cannot carry a bit into the neighbouring byte
+ times 16 db 0xfe
+align 16
+t80: ; 0x80 per byte: XOR with this flips each byte's sign bit (unsigned <-> signed offset)
+ times 16 db 0x80
+align 16
+t1s: ; 0x01 per byte
+ times 16 db 0x01
+align 16
+t3: ; 0x03 per byte: added to the filter value before the >> 3 that adjusts p0 in the simple filters
+ times 16 db 0x03
+align 16
+t4: ; 0x04 per byte: added to the filter value before the >> 3 that adjusts q0 in the simple filters
+ times 16 db 0x04
+align 16
+ones: ; 1 in each of 8 words
+ times 8 dw 0x0001
+align 16
+s9: ; 0x0900 in each of 8 words
+ times 8 dw 0x0900
+align 16
+s63: ; 63 in each of 8 words
+ times 8 dw 0x003f
+align 16
+te0: ; 0xe0 per byte: top 3 bits, used to put the sign back after a shift right by 3
+ times 16 db 0xe0
+align 16
+t1f: ; 0x1f per byte: low 5 bits, discards what a word-wide shift by 3 moved in from the next byte
+ times 16 db 0x1f
diff --git a/media/libvpx/libvpx/vp8/common/x86/loopfilter_x86.c b/media/libvpx/libvpx/vp8/common/x86/loopfilter_x86.c
new file mode 100644
index 0000000000..cfa13a2ddb
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/x86/loopfilter_x86.c
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vp8/common/loopfilter.h"
+
+#define prototype_loopfilter(sym) \
+ void sym(unsigned char *src, int pitch, const unsigned char *blimit, \
+ const unsigned char *limit, const unsigned char *thresh, int count) /* edge filter taking a trailing count */
+
+#define prototype_loopfilter_nc(sym) \
+ void sym(unsigned char *src, int pitch, const unsigned char *blimit, \
+ const unsigned char *limit, const unsigned char *thresh) /* "nc": same signature without count */
+
+#define prototype_simple_loopfilter(sym) \
+ void sym(unsigned char *y, int ystride, const unsigned char *blimit) /* not expanded anywhere in this file */
+
+#if HAVE_SSE2 && VPX_ARCH_X86_64 /* x86-64: one luma routine per direction, called with count 2 */
+prototype_loopfilter(vp8_loop_filter_bv_y_sse2);
+prototype_loopfilter(vp8_loop_filter_bh_y_sse2);
+#else /* otherwise: per-edge routines, called three times per macroblock */
+prototype_loopfilter_nc(vp8_loop_filter_vertical_edge_sse2);
+prototype_loopfilter_nc(vp8_loop_filter_horizontal_edge_sse2);
+#endif /* HAVE_SSE2 && VPX_ARCH_X86_64 */
+prototype_loopfilter_nc(vp8_mbloop_filter_vertical_edge_sse2);
+prototype_loopfilter_nc(vp8_mbloop_filter_horizontal_edge_sse2);
+
+extern loop_filter_uvfunction vp8_loop_filter_horizontal_edge_uv_sse2; /* chroma variants: take u, then v as last argument */
+extern loop_filter_uvfunction vp8_loop_filter_vertical_edge_uv_sse2;
+extern loop_filter_uvfunction vp8_mbloop_filter_horizontal_edge_uv_sse2;
+extern loop_filter_uvfunction vp8_mbloop_filter_vertical_edge_uv_sse2;
+
+/* Horizontal MB filtering */
+#if HAVE_SSE2
+void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
+                              unsigned char *v_ptr, int y_stride, int uv_stride,
+                              loop_filter_info *lfi) {
+  /* The luma edge is always filtered, using the macroblock limits. */
+  vp8_mbloop_filter_horizontal_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim,
+                                         lfi->hev_thr);
+  /* Chroma is optional: stop here when no U plane pointer was supplied. */
+  if (!u_ptr) return;
+  vp8_mbloop_filter_horizontal_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim,
+                                            lfi->lim, lfi->hev_thr, v_ptr);
+}
+
+/* Vertical MB Filtering */
+void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
+                              unsigned char *v_ptr, int y_stride, int uv_stride,
+                              loop_filter_info *lfi) {
+  /* The luma edge is always filtered, using the macroblock limits. */
+  vp8_mbloop_filter_vertical_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim,
+                                       lfi->hev_thr);
+  /* Chroma is optional: stop here when no U plane pointer was supplied. */
+  if (!u_ptr) return;
+  vp8_mbloop_filter_vertical_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim,
+                                          lfi->lim, lfi->hev_thr, v_ptr);
+}
+
+/* Horizontal B Filtering */
+void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
+                             unsigned char *v_ptr, int y_stride, int uv_stride,
+                             loop_filter_info *lfi) {
+#if VPX_ARCH_X86_64
+  /* x86-64: a single luma call, passed the macroblock origin and count 2. */
+  vp8_loop_filter_bh_y_sse2(y_ptr, y_stride, lfi->blim, lfi->lim, lfi->hev_thr,
+                            2);
+#else
+  int row;
+  /* Luma edges 4, 8 and 12 rows below y_ptr, one call each. */
+  for (row = 4; row <= 12; row += 4) {
+    vp8_loop_filter_horizontal_edge_sse2(y_ptr + row * y_stride, y_stride,
+                                         lfi->blim, lfi->lim, lfi->hev_thr);
+  }
+#endif
+  /* Chroma: one edge 4 rows down in U and V; skipped without a U pointer. */
+  if (!u_ptr) return;
+  vp8_loop_filter_horizontal_edge_uv_sse2(u_ptr + 4 * uv_stride, uv_stride,
+                                          lfi->blim, lfi->lim, lfi->hev_thr,
+                                          v_ptr + 4 * uv_stride);
+}
+
+void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride,
+                              const unsigned char *blimit) {
+  int row;
+  /* Simple filter on the luma edges 4, 8 and 12 rows below y_ptr. */
+  for (row = 4; row <= 12; row += 4) {
+    vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + row * y_stride,
+                                                y_stride, blimit);
+  }
+}
+
+/* Vertical B Filtering */
+void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
+                             unsigned char *v_ptr, int y_stride, int uv_stride,
+                             loop_filter_info *lfi) {
+#if VPX_ARCH_X86_64
+  /* x86-64: a single luma call, passed the macroblock origin and count 2. */
+  vp8_loop_filter_bv_y_sse2(y_ptr, y_stride, lfi->blim, lfi->lim, lfi->hev_thr,
+                            2);
+#else
+  int col;
+  /* Luma edges 4, 8 and 12 bytes to the right of y_ptr, one call each. */
+  for (col = 4; col <= 12; col += 4) {
+    vp8_loop_filter_vertical_edge_sse2(y_ptr + col, y_stride, lfi->blim,
+                                       lfi->lim, lfi->hev_thr);
+  }
+#endif
+  /* Chroma: one edge at byte offset 4 in U and V; skipped without a U pointer. */
+  if (!u_ptr) return;
+  vp8_loop_filter_vertical_edge_uv_sse2(u_ptr + 4, uv_stride, lfi->blim,
+                                        lfi->lim, lfi->hev_thr, v_ptr + 4);
+}
+
+void vp8_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride,
+                              const unsigned char *blimit) {
+  int col; /* byte offset of each filtered luma edge: 4, 8, 12 */
+  for (col = 4; col <= 12; col += 4) {
+    vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + col, y_stride, blimit);
+  }
+}
+
+#endif
diff --git a/media/libvpx/libvpx/vp8/common/x86/mfqe_sse2.asm b/media/libvpx/libvpx/vp8/common/x86/mfqe_sse2.asm
new file mode 100644
index 0000000000..3ec2a99ec2
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/x86/mfqe_sse2.asm
@@ -0,0 +1,289 @@
+;
+; Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+SECTION .text
+
+;void vp8_filter_by_weight16x16_sse2
+;( blends a 16x16 block into dst: dst = (src * w + dst * (16 - w) + 8) >> 4, with w = src_weight
+; unsigned char *src, (rows are read with movdqa, so each row must be 16-byte aligned)
+; int src_stride, (signed byte offset between src rows)
+; unsigned char *dst, (read and rewritten in place, also with movdqa)
+; int dst_stride, (signed byte offset between dst rows)
+; int src_weight (its low 16 bits are broadcast to all eight words)
+;)
+globalsym(vp8_filter_by_weight16x16_sse2)
+sym(vp8_filter_by_weight16x16_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movd xmm0, arg(4) ; src_weight
+ pshuflw xmm0, xmm0, 0x0 ; replicate to all low words
+ punpcklqdq xmm0, xmm0 ; replicate to all hi words
+
+ movdqa xmm1, [GLOBAL(tMFQE)] ; 16 in every word
+ psubw xmm1, xmm0 ; dst_weight
+
+ mov rax, arg(0) ; src
+ movsxd rsi, dword ptr arg(1) ; src_stride: an int, so sign-extend; the slot's upper half is not defined
+ mov rdx, arg(2) ; dst
+ movsxd rdi, dword ptr arg(3) ; dst_stride: sign-extended for the same reason
+
+ mov rcx, 16 ; loop count
+ pxor xmm6, xmm6 ; zero, for byte -> word unpacking
+
+.combine:
+ movdqa xmm2, [rax] ; 16 src pixels
+ movdqa xmm4, [rdx] ; 16 dst pixels
+ add rax, rsi
+
+ ; src * src_weight
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, xmm6
+ punpckhbw xmm3, xmm6
+ pmullw xmm2, xmm0
+ pmullw xmm3, xmm0
+
+ ; dst * dst_weight
+ movdqa xmm5, xmm4
+ punpcklbw xmm4, xmm6
+ punpckhbw xmm5, xmm6
+ pmullw xmm4, xmm1
+ pmullw xmm5, xmm1
+
+ ; sum, round and shift
+ paddw xmm2, xmm4
+ paddw xmm3, xmm5
+ paddw xmm2, [GLOBAL(tMFQE_round)]
+ paddw xmm3, [GLOBAL(tMFQE_round)]
+ psrlw xmm2, 4
+ psrlw xmm3, 4
+
+ packuswb xmm2, xmm3 ; back to 16 unsigned bytes
+ movdqa [rdx], xmm2
+ add rdx, rdi
+
+ dec rcx
+ jnz .combine
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+
+ ret
+
+;void vp8_filter_by_weight8x8_sse2
+;( blends an 8x8 block into dst: dst = (src * w + dst * (16 - w) + 8) >> 4, with w = src_weight
+; unsigned char *src, (8 bytes per row, read with movq)
+; int src_stride, (signed byte offset between src rows)
+; unsigned char *dst, (read and rewritten in place, 8 bytes per row)
+; int dst_stride, (signed byte offset between dst rows)
+; int src_weight (its low 16 bits are broadcast to all eight words)
+;)
+globalsym(vp8_filter_by_weight8x8_sse2)
+sym(vp8_filter_by_weight8x8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movd xmm0, arg(4) ; src_weight
+ pshuflw xmm0, xmm0, 0x0 ; replicate to all low words
+ punpcklqdq xmm0, xmm0 ; replicate to all hi words
+
+ movdqa xmm1, [GLOBAL(tMFQE)] ; 16 in every word
+ psubw xmm1, xmm0 ; dst_weight
+
+ mov rax, arg(0) ; src
+ movsxd rsi, dword ptr arg(1) ; src_stride: an int, so sign-extend; the slot's upper half is not defined
+ mov rdx, arg(2) ; dst
+ movsxd rdi, dword ptr arg(3) ; dst_stride: sign-extended for the same reason
+
+ mov rcx, 8 ; loop count
+ pxor xmm4, xmm4 ; zero, for byte -> word unpacking
+
+.combine:
+ movq xmm2, [rax] ; 8 src pixels
+ movq xmm3, [rdx] ; 8 dst pixels
+ add rax, rsi
+
+ ; src * src_weight
+ punpcklbw xmm2, xmm4
+ pmullw xmm2, xmm0
+
+ ; dst * dst_weight
+ punpcklbw xmm3, xmm4
+ pmullw xmm3, xmm1
+
+ ; sum, round and shift
+ paddw xmm2, xmm3
+ paddw xmm2, [GLOBAL(tMFQE_round)]
+ psrlw xmm2, 4
+
+ packuswb xmm2, xmm4 ; back to unsigned bytes; only the low 8 are stored
+ movq [rdx], xmm2
+ add rdx, rdi
+
+ dec rcx
+ jnz .combine
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+
+ ret
+
+;void vp8_variance_and_sad_16x16_sse2 | arg
+;( *sad = (SAD(src1, src2) + 128) >> 8; *variance = (sum(src2^2) - (sum(src2)^2 >> 8) + 128) >> 8
+; unsigned char *src1, 0
+; int stride1, 1
+; unsigned char *src2, 2
+; int stride2, 3
+; unsigned int *variance, 4
+; unsigned int *sad, 5
+;)
+globalsym(vp8_variance_and_sad_16x16_sse2)
+sym(vp8_variance_and_sad_16x16_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rax, arg(0) ; src1
+ movsxd rcx, dword ptr arg(1) ; stride1: an int, so sign-extend; the slot's upper half is not defined
+ mov rdx, arg(2) ; src2
+ movsxd rdi, dword ptr arg(3) ; stride2: sign-extended for the same reason
+
+ mov rsi, 16 ; block height
+
+ ; Prep accumulator registers
+ pxor xmm3, xmm3 ; SAD
+ pxor xmm4, xmm4 ; sum of src2
+ pxor xmm5, xmm5 ; sum of src2^2
+
+ ; Because we're working with the actual output frames
+ ; we can't depend on any kind of data alignment, so use unaligned loads.
+.accumulate:
+ movdqu xmm0, [rax] ; src1 (movdqa would fault on an unaligned row)
+ movdqu xmm1, [rdx] ; src2
+ add rax, rcx ; src1 + stride1
+ add rdx, rdi ; src2 + stride2
+
+ ; SAD(src1, src2)
+ psadbw xmm0, xmm1
+ paddusw xmm3, xmm0
+
+ ; SUM(src2)
+ pxor xmm2, xmm2
+ psadbw xmm2, xmm1 ; sum src2 by misusing SAD against 0
+ paddusw xmm4, xmm2
+
+ ; pmaddubsw would be ideal if it took two unsigned values. instead,
+ ; it expects a signed and an unsigned value. so instead we zero extend
+ ; and operate on words.
+ pxor xmm2, xmm2
+ movdqa xmm0, xmm1
+ punpcklbw xmm0, xmm2
+ punpckhbw xmm1, xmm2
+ pmaddwd xmm0, xmm0
+ pmaddwd xmm1, xmm1
+ paddd xmm5, xmm0
+ paddd xmm5, xmm1
+
+ sub rsi, 1
+ jnz .accumulate
+
+ ; phaddd only operates on adjacent double words.
+ ; Finalize SAD and store
+ movdqa xmm0, xmm3
+ psrldq xmm0, 8 ; bring the high-half partial sum down
+ paddusw xmm0, xmm3
+ paddd xmm0, [GLOBAL(t128)] ; (SAD + 128) >> 8
+ psrld xmm0, 8
+
+ mov rax, arg(5) ; sad
+ movd [rax], xmm0
+
+ ; Accumulate sum of src2
+ movdqa xmm0, xmm4
+ psrldq xmm0, 8
+ paddusw xmm0, xmm4
+ ; Square src2. Ignore high value
+ pmuludq xmm0, xmm0
+ psrld xmm0, 8 ; sum(src2)^2 >> 8
+
+ ; phaddw could be used to sum adjacent values but we want
+ ; all the values summed. promote to doubles, accumulate,
+ ; shift and sum
+ pxor xmm2, xmm2
+ movdqa xmm1, xmm5
+ punpckldq xmm1, xmm2
+ punpckhdq xmm5, xmm2
+ paddd xmm1, xmm5
+ movdqa xmm2, xmm1
+ psrldq xmm1, 8
+ paddd xmm1, xmm2 ; low dword = sum of all four src2^2 partial sums
+
+ psubd xmm1, xmm0
+
+ ; (variance + 128) >> 8
+ paddd xmm1, [GLOBAL(t128)]
+ psrld xmm1, 8
+ mov rax, arg(4) ; variance
+
+ movd [rax], xmm1
+
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+t128: ; 16-byte constant with the value 128: the rounding term added before each ">> 8"
+%ifndef __NASM_VER__
+ ddq 128
+%elif CONFIG_BIG_ENDIAN
+ dq 0, 128
+%else
+ dq 128, 0
+%endif
+align 16
+tMFQE: ; 1 << MFQE_PRECISION (16 in each of 8 words; the two blend weights sum to this)
+ times 8 dw 0x10
+align 16
+tMFQE_round: ; 1 << (MFQE_PRECISION - 1) (8 in each of 8 words, added before the shift right by 4)
+ times 8 dw 0x08
+
diff --git a/media/libvpx/libvpx/vp8/common/x86/recon_mmx.asm b/media/libvpx/libvpx/vp8/common/x86/recon_mmx.asm
new file mode 100644
index 0000000000..01cf066837
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/x86/recon_mmx.asm
@@ -0,0 +1,120 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+SECTION .text
+
+;void vp8_copy_mem8x8_mmx( copies 8 rows of 8 bytes from src to dst
+; unsigned char *src,
+; int src_stride,
+; unsigned char *dst,
+; int dst_stride
+; )
+globalsym(vp8_copy_mem8x8_mmx)
+sym(vp8_copy_mem8x8_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src;
+ movq mm0, [rsi] ; row 0
+
+ movsxd rax, dword ptr arg(1) ;src_stride;
+ mov rdi, arg(2) ;dst;
+
+ movq mm1, [rsi+rax] ; row 1
+ movq mm2, [rsi+rax*2] ; row 2
+
+ movsxd rcx, dword ptr arg(3) ;dst_stride
+ lea rsi, [rsi+rax*2]
+
+ movq [rdi], mm0
+ add rsi, rax ; rsi -> src row 3 (lea + add avoids a 3 * stride multiply)
+
+ movq [rdi+rcx], mm1
+ movq [rdi+rcx*2], mm2
+
+
+ lea rdi, [rdi+rcx*2]
+ movq mm3, [rsi] ; row 3
+
+ add rdi, rcx ; rdi -> dst row 3
+ movq mm4, [rsi+rax] ; row 4
+
+ movq mm5, [rsi+rax*2] ; row 5
+ movq [rdi], mm3
+
+ lea rsi, [rsi+rax*2] ; rsi -> src row 5
+ movq [rdi+rcx], mm4
+
+ movq [rdi+rcx*2], mm5
+ lea rdi, [rdi+rcx*2] ; rdi -> dst row 5
+
+ movq mm0, [rsi+rax] ; row 6
+ movq mm1, [rsi+rax*2] ; row 7
+
+ movq [rdi+rcx], mm0
+ movq [rdi+rcx*2],mm1
+
+ ; begin epilog (no emms is issued here -- NOTE(review): callers presumably reset MMX state; confirm)
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_copy_mem8x4_mmx( copies 4 rows of 8 bytes from src to dst
+; unsigned char *src,
+; int src_stride,
+; unsigned char *dst,
+; int dst_stride
+; )
+globalsym(vp8_copy_mem8x4_mmx)
+sym(vp8_copy_mem8x4_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src;
+ movq mm0, [rsi] ; row 0
+
+ movsxd rax, dword ptr arg(1) ;src_stride;
+ mov rdi, arg(2) ;dst;
+
+ movq mm1, [rsi+rax] ; row 1
+ movq mm2, [rsi+rax*2] ; row 2
+
+ movsxd rcx, dword ptr arg(3) ;dst_stride
+ lea rsi, [rsi+rax*2] ; rsi -> src row 2
+
+ movq [rdi], mm0
+ movq [rdi+rcx], mm1
+
+ movq [rdi+rcx*2], mm2
+ lea rdi, [rdi+rcx*2] ; rdi -> dst row 2
+
+ movq mm3, [rsi+rax] ; row 3
+ movq [rdi+rcx], mm3
+
+ ; begin epilog (no emms is issued here -- NOTE(review): callers presumably reset MMX state; confirm)
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/media/libvpx/libvpx/vp8/common/x86/recon_sse2.asm b/media/libvpx/libvpx/vp8/common/x86/recon_sse2.asm
new file mode 100644
index 0000000000..17baf094ef
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/x86/recon_sse2.asm
@@ -0,0 +1,118 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+SECTION .text
+
+;void vp8_copy_mem16x16_sse2( copies 16 rows of 16 bytes from src to dst
+; unsigned char *src, (read with movdqu: no alignment requirement)
+; int src_stride,
+; unsigned char *dst, (written with movdqa: every dst row must be 16-byte aligned)
+; int dst_stride
+; )
+globalsym(vp8_copy_mem16x16_sse2)
+sym(vp8_copy_mem16x16_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src;
+ movdqu xmm0, [rsi] ; row 0
+
+ movsxd rax, dword ptr arg(1) ;src_stride;
+ mov rdi, arg(2) ;dst;
+
+ movdqu xmm1, [rsi+rax] ; row 1
+ movdqu xmm2, [rsi+rax*2] ; row 2
+
+ movsxd rcx, dword ptr arg(3) ;dst_stride
+ lea rsi, [rsi+rax*2]
+
+ movdqa [rdi], xmm0
+ add rsi, rax ; rsi -> src row 3 (lea + add avoids a 3 * stride multiply)
+
+ movdqa [rdi+rcx], xmm1
+ movdqa [rdi+rcx*2],xmm2
+
+ lea rdi, [rdi+rcx*2]
+ movdqu xmm3, [rsi] ; row 3
+
+ add rdi, rcx ; rdi -> dst row 3
+ movdqu xmm4, [rsi+rax] ; row 4
+
+ movdqu xmm5, [rsi+rax*2] ; row 5
+ lea rsi, [rsi+rax*2]
+
+ movdqa [rdi], xmm3
+ add rsi, rax ; rsi -> src row 6
+
+ movdqa [rdi+rcx], xmm4
+ movdqa [rdi+rcx*2],xmm5
+
+ lea rdi, [rdi+rcx*2]
+ movdqu xmm0, [rsi] ; row 6
+
+ add rdi, rcx ; rdi -> dst row 6
+ movdqu xmm1, [rsi+rax] ; row 7
+
+ movdqu xmm2, [rsi+rax*2] ; row 8
+ lea rsi, [rsi+rax*2]
+
+ movdqa [rdi], xmm0
+ add rsi, rax ; rsi -> src row 9
+
+ movdqa [rdi+rcx], xmm1
+
+ movdqa [rdi+rcx*2], xmm2
+ movdqu xmm3, [rsi] ; row 9
+
+ movdqu xmm4, [rsi+rax] ; row 10
+ lea rdi, [rdi+rcx*2]
+
+ add rdi, rcx ; rdi -> dst row 9
+ movdqu xmm5, [rsi+rax*2] ; row 11
+
+ lea rsi, [rsi+rax*2]
+ movdqa [rdi], xmm3
+
+ add rsi, rax ; rsi -> src row 12
+ movdqa [rdi+rcx], xmm4
+
+ movdqa [rdi+rcx*2],xmm5
+ movdqu xmm0, [rsi] ; row 12
+
+ lea rdi, [rdi+rcx*2]
+ movdqu xmm1, [rsi+rax] ; row 13
+
+ add rdi, rcx ; rdi -> dst row 12
+ movdqu xmm2, [rsi+rax*2] ; row 14
+
+ lea rsi, [rsi+rax*2] ; rsi -> src row 14
+ movdqa [rdi], xmm0
+
+ movdqa [rdi+rcx], xmm1
+ movdqa [rdi+rcx*2],xmm2
+
+ movdqu xmm3, [rsi+rax] ; row 15
+ lea rdi, [rdi+rcx*2] ; rdi -> dst row 14
+
+ movdqa [rdi+rcx], xmm3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/media/libvpx/libvpx/vp8/common/x86/subpixel_mmx.asm b/media/libvpx/libvpx/vp8/common/x86/subpixel_mmx.asm
new file mode 100644
index 0000000000..8f0f6fcc89
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/x86/subpixel_mmx.asm
@@ -0,0 +1,270 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%define BLOCK_HEIGHT_WIDTH 4
+%define vp8_filter_weight 128
+%define VP8_FILTER_SHIFT 7
+
+SECTION .text
+
+;void vp8_filter_block1d_h6_mmx
+;( horizontal 6-tap filter: each row yields 4 outputs, stored as 16-bit words clamped to 0..255
+; unsigned char *src_ptr,
+; unsigned short *output_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned int pixel_step, (arg 3: not read by this routine)
+; unsigned int output_height,
+; unsigned int output_width, (added to output_ptr after each row, as a byte count)
+; short * vp8_filter (six taps, one every 16 bytes; 4 words of each are loaded)
+;)
+globalsym(vp8_filter_block1d_h6_mmx)
+sym(vp8_filter_block1d_h6_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rdx, arg(6) ;vp8_filter
+
+ movq mm1, [rdx + 16] ; tap 1 -- do both the negative taps first!!!
+ movq mm2, [rdx + 32] ; tap 2
+ movq mm6, [rdx + 48] ; tap 3
+ movq mm7, [rdx + 64] ; tap 4 (taps 0 and 5 are read from memory inside the loop)
+
+ mov rdi, arg(1) ;output_ptr
+ mov rsi, arg(0) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;output_height
+ movsxd rax, dword ptr arg(5) ;output_width ; used below as the byte step of output_ptr
+ pxor mm0, mm0 ; mm0 = 00000000
+
+.nextrow:
+ movq mm3, [rsi-2] ; mm3 = p-2..p5
+ movq mm4, mm3 ; mm4 = p-2..p5
+ psrlq mm3, 8 ; mm3 = p-1..p5
+ punpcklbw mm3, mm0 ; mm3 = p-1..p2
+ pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers.
+
+ movq mm5, mm4 ; mm5 = p-2..p5
+ punpckhbw mm4, mm0 ; mm4 = p2..p5
+ pmullw mm4, mm7 ; mm4 *= kernel 4 modifiers
+ paddsw mm3, mm4 ; mm3 += mm4
+
+ movq mm4, mm5 ; mm4 = p-2..p5;
+ psrlq mm5, 16 ; mm5 = p0..p5;
+ punpcklbw mm5, mm0 ; mm5 = p0..p3
+ pmullw mm5, mm2 ; mm5 *= kernel 2 modifiers
+ paddsw mm3, mm5 ; mm3 += mm5
+
+ movq mm5, mm4 ; mm5 = p-2..p5
+ psrlq mm4, 24 ; mm4 = p1..p5
+ punpcklbw mm4, mm0 ; mm4 = p1..p4
+ pmullw mm4, mm6 ; mm4 *= kernel 3 modifiers
+ paddsw mm3, mm4 ; mm3 += mm4
+
+ ; do outer positive taps
+ movd mm4, [rsi+3] ; 4 bytes starting at p3
+ punpcklbw mm4, mm0 ; mm4 = p3..p6
+ pmullw mm4, [rdx+80] ; mm4 *= kernel 5 modifiers (last tap, byte offset 80)
+ paddsw mm3, mm4 ; mm3 += mm4
+
+ punpcklbw mm5, mm0 ; mm5 = p-2..p1
+ pmullw mm5, [rdx] ; mm5 *= kernel 0 modifiers (first tap, byte offset 0)
+ paddsw mm3, mm5 ; mm3 += mm5
+
+ paddsw mm3, [GLOBAL(rd)] ; mm3 += round value
+ psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
+ packuswb mm3, mm0 ; pack and unpack to saturate
+ punpcklbw mm3, mm0 ; back to 4 words, now clamped to 0..255
+
+ movq [rdi], mm3 ; store the results in the destination
+
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(2) ;src_pixels_per_line ; next line
+ add rdi, rax;
+%else
+ movsxd r8, dword ptr arg(2) ;src_pixels_per_line
+ add rdi, rax;
+
+ add rsi, r8 ; next line
+%endif
+
+ dec rcx ; decrement count
+ jnz .nextrow ; next row
+
+ ; begin epilog (no emms is issued here -- NOTE(review): callers presumably reset MMX state; confirm)
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_filter_block1dc_v6_mmx
+;( vertical 6-tap filter over rows of 16-bit values: each output row is 4 bytes, saturated to 0..255
+; short *src_ptr,
+; unsigned char *output_ptr,
+; int output_pitch,
+; unsigned int pixels_per_line,
+; unsigned int pixel_step, (arg 4: not read by this routine)
+; unsigned int output_height,
+; unsigned int output_width, (arg 6: not read by this routine)
+; short * vp8_filter (six taps, one every 16 bytes; 4 words of each are loaded)
+;)
+globalsym(vp8_filter_block1dc_v6_mmx)
+sym(vp8_filter_block1dc_v6_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 8
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movq mm5, [GLOBAL(rd)] ; round value, fetched before rbx is repurposed just below
+ push rbx ; rbx is about to hold the filter pointer; restored after the loop
+ mov rbx, arg(7) ;vp8_filter
+ movq mm1, [rbx + 16] ; tap 1 -- do both the negative taps first!!!
+ movq mm2, [rbx + 32] ; tap 2
+ movq mm6, [rbx + 48] ; tap 3
+ movq mm7, [rbx + 64] ; tap 4 (taps 0 and 5 are read from memory inside the loop)
+
+ movsxd rdx, dword ptr arg(3) ;pixels_per_line ; NOTE(review): added to rsi as a byte offset although src_ptr is short* -- confirm what callers pass
+ mov rdi, arg(1) ;output_ptr
+ mov rsi, arg(0) ;src_ptr
+ sub rsi, rdx
+ sub rsi, rdx ; rsi -> row -2
+ movsxd rcx, DWORD PTR arg(5) ;output_height
+ movsxd rax, DWORD PTR arg(2) ;output_pitch ; bytes added to rdi per output row
+ pxor mm0, mm0 ; mm0 = 00000000
+
+
+.nextrow_cv:
+ movq mm3, [rsi+rdx] ; mm3 = p0..p3 = row -1
+ pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers.
+
+
+ movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 2
+ pmullw mm4, mm7 ; mm4 *= kernel 4 modifiers.
+ paddsw mm3, mm4 ; mm3 += mm4
+
+ movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 0
+ pmullw mm4, mm2 ; mm4 *= kernel 2 modifiers.
+ paddsw mm3, mm4 ; mm3 += mm4
+
+ movq mm4, [rsi] ; mm4 = p0..p3 = row -2
+ pmullw mm4, [rbx] ; mm4 *= kernel 0 modifiers.
+ paddsw mm3, mm4 ; mm3 += mm4
+
+
+ add rsi, rdx ; move source forward 1 line to avoid 3 * pitch (also the only per-row advance of rsi)
+ movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 1
+ pmullw mm4, mm6 ; mm4 *= kernel 3 modifiers.
+ paddsw mm3, mm4 ; mm3 += mm4
+
+ movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 3
+ pmullw mm4, [rbx +80] ; mm4 *= kernel 5 modifiers.
+ paddsw mm3, mm4 ; mm3 += mm4
+
+
+ paddsw mm3, mm5 ; mm3 += round value
+ psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
+ packuswb mm3, mm0 ; pack and saturate
+
+ movd [rdi],mm3 ; store the results in the destination
+ ; the subsequent iterations repeat 3 out of 4 of these reads. Since the
+ ; recon block should be in cache this shouldn't cost much. Its obviously
+ ; avoidable!!!.
+ lea rdi, [rdi+rax] ;
+ dec rcx ; decrement count
+ jnz .nextrow_cv ; next row
+
+ pop rbx ; undo the push made before loading the filter pointer
+
+ ; begin epilog (no emms is issued here -- NOTE(review): callers presumably reset MMX state; confirm)
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+SECTION_RODATA
+align 16
+rd: ; 64 in each of 4 words: rounding term for the shift right by 7
+ times 4 dw 0x40
+
+align 16
+global HIDDEN_DATA(sym(vp8_six_tap_x86))
+sym(vp8_six_tap_x86): ; 8 filters x 6 taps; every tap is repeated 8 times (16 bytes), and each filter sums to 128
+ times 8 dw 0 ; filter 0: 0, 0, 128, 0, 0, 0
+ times 8 dw 0
+ times 8 dw 128
+ times 8 dw 0
+ times 8 dw 0
+ times 8 dw 0
+
+ times 8 dw 0 ; filter 1: 0, -6, 123, 12, -1, 0
+ times 8 dw -6
+ times 8 dw 123
+ times 8 dw 12
+ times 8 dw -1
+ times 8 dw 0
+
+ times 8 dw 2 ; filter 2: 2, -11, 108, 36, -8, 1
+ times 8 dw -11
+ times 8 dw 108
+ times 8 dw 36
+ times 8 dw -8
+ times 8 dw 1
+
+ times 8 dw 0 ; filter 3: 0, -9, 93, 50, -6, 0
+ times 8 dw -9
+ times 8 dw 93
+ times 8 dw 50
+ times 8 dw -6
+ times 8 dw 0
+
+ times 8 dw 3 ; filter 4: 3, -16, 77, 77, -16, 3
+ times 8 dw -16
+ times 8 dw 77
+ times 8 dw 77
+ times 8 dw -16
+ times 8 dw 3
+
+ times 8 dw 0 ; filter 5: 0, -6, 50, 93, -9, 0 (filter 3 reversed)
+ times 8 dw -6
+ times 8 dw 50
+ times 8 dw 93
+ times 8 dw -9
+ times 8 dw 0
+
+ times 8 dw 1 ; filter 6: 1, -8, 36, 108, -11, 2 (filter 2 reversed)
+ times 8 dw -8
+ times 8 dw 36
+ times 8 dw 108
+ times 8 dw -11
+ times 8 dw 2
+
+ times 8 dw 0 ; filter 7: 0, -1, 12, 123, -6, 0 (filter 1 reversed)
+ times 8 dw -1
+ times 8 dw 12
+ times 8 dw 123
+ times 8 dw -6
+ times 8 dw 0
+
+
diff --git a/media/libvpx/libvpx/vp8/common/x86/subpixel_sse2.asm b/media/libvpx/libvpx/vp8/common/x86/subpixel_sse2.asm
new file mode 100644
index 0000000000..94e14aed6c
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/x86/subpixel_sse2.asm
@@ -0,0 +1,963 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%define BLOCK_HEIGHT_WIDTH 4
+%define VP8_FILTER_WEIGHT 128
+%define VP8_FILTER_SHIFT 7
+
+SECTION .text
+
+;/************************************************************************************
+; Notes: vp8_filter_block1d8_h6_sse2 applies a 6 tap filter horizontally to the input
+; pixels. Each loop iteration produces ONE row of 8 outputs: the six tap products are
+; summed with saturation, rounded (+0x40), shifted right by 7, clamped to 0..255 and
+; stored as eight 16-bit words.
+; NOTE(review): the original notes said output_height is assumed to be even; the loop
+; below is a plain dec/jnz loop that only needs output_height >= 1 -- confirm with callers.
+;*************************************************************************************/
+;void vp8_filter_block1d8_h6_sse2
+;(
+; unsigned char *src_ptr, ; bytes src_ptr[-2 .. 13] are loaded for every row
+; unsigned short *output_ptr, ; stored with movdqa, 16 bytes per row
+; unsigned int src_pixels_per_line, ; added to src_ptr after every row
+; unsigned int pixel_step, ; not read by this routine
+; unsigned int output_height, ; number of rows to produce
+; unsigned int output_width, ; added (as a byte count) to output_ptr after every row
+; short *vp8_filter ; six taps at 16-byte intervals, 8 words each
+;)
+globalsym(vp8_filter_block1d8_h6_sse2)
+sym(vp8_filter_block1d8_h6_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rdx, arg(6) ;vp8_filter
+ mov rsi, arg(0) ;src_ptr
+
+ mov rdi, arg(1) ;output_ptr
+
+ movsxd rcx, dword ptr arg(4) ;output_height
+ movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(5) ;output_width
+%endif
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+
+.filter_block1d8_h6_rowloop:
+ movq xmm3, MMWORD PTR [rsi - 2] ; source bytes -2 .. 5
+ movq xmm1, MMWORD PTR [rsi + 6] ; source bytes 6 .. 13
+
+ prefetcht2 [rsi+rax-2] ; hint: start of the next source row
+
+ pslldq xmm1, 8
+ por xmm1, xmm3 ; xmm1 = 16 source bytes starting at src[-2]
+
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm1
+
+ movdqa xmm6, xmm1
+ movdqa xmm7, xmm1
+
+ punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
+ psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+ pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
+ punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+ psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+ pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
+
+
+ punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+ psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+ pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
+
+ punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+ psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+ pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
+
+ punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+ psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+
+ pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
+
+ punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+ pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
+
+
+ paddsw xmm4, xmm7 ; saturating sum of the six tap products
+ paddsw xmm4, xmm5
+
+ paddsw xmm4, xmm3
+ paddsw xmm4, xmm6
+
+ paddsw xmm4, xmm1
+ paddsw xmm4, [GLOBAL(rd)] ; + rounding constant
+
+ psraw xmm4, 7 ; same value as VP8_FILTER_SHIFT
+
+ packuswb xmm4, xmm0 ; clamp to 0..255 ...
+ punpcklbw xmm4, xmm0 ; ... and widen back to 16-bit words
+
+ movdqa XMMWORD Ptr [rdi], xmm4 ; aligned store of 8 words
+ lea rsi, [rsi + rax]
+
+%if ABI_IS_32BIT
+ add rdi, DWORD Ptr arg(5) ;[output_width]
+%else
+ add rdi, r8
+%endif
+ dec rcx
+
+ jnz .filter_block1d8_h6_rowloop ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_filter_block1d16_h6_sse2
+;(
+; unsigned char *src_ptr, ; bytes src_ptr[-2 .. 18] are loaded for every row
+; unsigned short *output_ptr, ; stored with movdqa, 32 bytes per row
+; unsigned int src_pixels_per_line, ; added to src_ptr after every row
+; unsigned int pixel_step, ; not read by this routine
+; unsigned int output_height, ; number of rows to produce
+; unsigned int output_width, ; added (as a byte count) to output_ptr after every row
+; short *vp8_filter ; six taps at 16-byte intervals, 8 words each
+;)
+;/************************************************************************************
+; Notes: vp8_filter_block1d16_h6_sse2 applies a 6 tap filter horizontally to the input
+; pixels. Each loop iteration produces ONE row of 16 outputs, computed as two halves of
+; 8 pixels; every result is rounded, shifted right by 7, clamped to 0..255 and stored
+; as a 16-bit word.
+; NOTE(review): the original notes said output_height is assumed to be even; the loop
+; below is a plain dec/jnz loop that only needs output_height >= 1 -- confirm with callers.
+;*************************************************************************************/
+globalsym(vp8_filter_block1d16_h6_sse2)
+sym(vp8_filter_block1d16_h6_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rdx, arg(6) ;vp8_filter
+ mov rsi, arg(0) ;src_ptr
+
+ mov rdi, arg(1) ;output_ptr
+
+ movsxd rcx, dword ptr arg(4) ;output_height
+ movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(5) ;output_width
+%endif
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+
+.filter_block1d16_h6_sse2_rowloop:
+ movq xmm3, MMWORD PTR [rsi - 2] ; source bytes -2 .. 5
+ movq xmm1, MMWORD PTR [rsi + 6] ; source bytes 6 .. 13
+
+ ; Load from 11 to avoid reading out of bounds.
+ movq xmm2, MMWORD PTR [rsi +11]
+ ; The lower bits are not cleared before 'or'ing with xmm1,
+ ; but that is OK because the values in the overlapping positions
+ ; are already equal to the ones in xmm1.
+ pslldq xmm2, 5
+
+ por xmm2, xmm1 ; xmm2 = source bytes 6 .. 18 (top 3 bytes zero)
+ prefetcht2 [rsi+rax-2] ; hint: start of the next source row
+
+ pslldq xmm1, 8
+ por xmm1, xmm3 ; xmm1 = source bytes -2 .. 13
+
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm1
+
+ movdqa xmm6, xmm1
+ movdqa xmm7, xmm1
+
+ punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
+ psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+ pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
+ punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+ psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+ pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
+
+
+ punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+ psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+ pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
+
+ punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+ psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+ pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
+
+ punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+ psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+
+ pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
+
+ punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+ pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
+
+ paddsw xmm4, xmm7 ; saturating sum of the six tap products
+ paddsw xmm4, xmm5
+
+ paddsw xmm4, xmm3
+ paddsw xmm4, xmm6
+
+ paddsw xmm4, xmm1
+ paddsw xmm4, [GLOBAL(rd)] ; + rounding constant
+
+ psraw xmm4, 7
+
+ packuswb xmm4, xmm0 ; clamp to 0..255 ...
+ punpcklbw xmm4, xmm0 ; ... and widen back to 16-bit words
+
+ movdqa XMMWORD Ptr [rdi], xmm4 ; outputs 0 .. 7
+
+ ; Second half (outputs 8 .. 15). xmm2 starts at source byte 6, so the byte
+ ; indices in the comments below are relative to pixel 8.
+ movdqa xmm3, xmm2
+ movdqa xmm4, xmm2
+
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm2
+
+ movdqa xmm7, xmm2
+
+ punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
+ psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+ pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
+ punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+ psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+ pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
+
+
+ punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+ psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+ pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
+
+ punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+ psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+ pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
+
+ punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+ psrldq xmm2, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+ pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
+
+ punpcklbw xmm2, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+ pmullw xmm2, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
+
+
+ paddsw xmm4, xmm7
+ paddsw xmm4, xmm5
+
+ paddsw xmm4, xmm3
+ paddsw xmm4, xmm6
+
+ paddsw xmm4, xmm2
+ paddsw xmm4, [GLOBAL(rd)]
+
+ psraw xmm4, 7
+
+ packuswb xmm4, xmm0
+ punpcklbw xmm4, xmm0
+
+ movdqa XMMWORD Ptr [rdi+16], xmm4 ; outputs 8 .. 15
+
+ lea rsi, [rsi + rax]
+%if ABI_IS_32BIT
+ add rdi, DWORD Ptr arg(5) ;[output_width]
+%else
+ add rdi, r8
+%endif
+
+ dec rcx
+ jnz .filter_block1d16_h6_sse2_rowloop ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_filter_block1d8_v6_sse2
+;(
+; short *src_ptr, ; rows of 8 words, loaded with movdqa
+; unsigned char *output_ptr, ; 8 bytes are stored per row
+; int dst_pitch, ; added to output_ptr after every row
+; unsigned int pixels_per_line, ; byte distance between source rows
+; unsigned int pixel_step, ; not read by this routine
+; unsigned int output_height, ; number of rows to produce
+; unsigned int output_width, ; not read by this routine
+; short * vp8_filter ; six taps at 16-byte intervals, 8 words each
+;)
+;/************************************************************************************
+; Notes: filter_block1d8_v6 applies a 6 tap filter vertically to the input pixels. The
+; input pixel array has output_height rows. For every output row the six source rows
+; starting two rows above it are multiplied by the taps, summed with saturation,
+; rounded, shifted right by 7 and packed to unsigned bytes.
+;*************************************************************************************/
+globalsym(vp8_filter_block1d8_v6_sse2)
+sym(vp8_filter_block1d8_v6_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 8
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rax, arg(7) ;vp8_filter
+ movsxd rdx, dword ptr arg(3) ;pixels_per_line
+
+ mov rdi, arg(1) ;output_ptr
+ mov rsi, arg(0) ;src_ptr
+
+ sub rsi, rdx ; step back two rows so [rsi] is the
+ sub rsi, rdx ; row used with the first tap
+
+ movsxd rcx, DWORD PTR arg(5) ;[output_height]
+ pxor xmm0, xmm0 ; clear xmm0
+
+ movdqa xmm7, XMMWORD PTR [GLOBAL(rd)] ; rounding constant, kept for the whole loop
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(2) ; dst_pitch
+%endif
+
+.vp8_filter_block1d8_v6_sse2_loop:
+ movdqa xmm1, XMMWORD PTR [rsi] ; row 0
+ pmullw xmm1, [rax]
+
+ movdqa xmm2, XMMWORD PTR [rsi + rdx] ; row 1
+ pmullw xmm2, [rax + 16]
+
+ movdqa xmm3, XMMWORD PTR [rsi + rdx * 2] ; row 2
+ pmullw xmm3, [rax + 32]
+
+ movdqa xmm5, XMMWORD PTR [rsi + rdx * 4] ; row 4
+ pmullw xmm5, [rax + 64]
+
+ add rsi, rdx ; reach rows 3 and 5 without 3*/5* scaling; also the per-row advance
+ movdqa xmm4, XMMWORD PTR [rsi + rdx * 2] ; row 3
+
+ pmullw xmm4, [rax + 48]
+ movdqa xmm6, XMMWORD PTR [rsi + rdx * 4] ; row 5
+
+ pmullw xmm6, [rax + 80]
+
+ paddsw xmm2, xmm5
+ paddsw xmm2, xmm3
+
+ paddsw xmm2, xmm1
+ paddsw xmm2, xmm4
+
+ paddsw xmm2, xmm6
+ paddsw xmm2, xmm7 ; + rounding constant
+
+ psraw xmm2, 7
+ packuswb xmm2, xmm0 ; pack and saturate
+
+ movq QWORD PTR [rdi], xmm2 ; store the results in the destination
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(2) ;[dst_pitch]
+%else
+ add rdi, r8
+%endif
+ dec rcx ; decrement count
+ jnz .vp8_filter_block1d8_v6_sse2_loop ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_filter_block1d16_v6_sse2
+;(
+; unsigned short *src_ptr, ; rows of 16 words, loaded with movdqa
+; unsigned char *output_ptr, ; 16 bytes are stored per row with movdqa
+; int dst_pitch, ; added to output_ptr after every row
+; unsigned int pixels_per_line, ; byte distance between source rows
+; unsigned int pixel_step, ; not read by this routine
+; unsigned int output_height, ; number of rows to produce
+; unsigned int output_width, ; not read by this routine
+; const short *vp8_filter ; six taps at 16-byte intervals, 8 words each
+;)
+;/************************************************************************************
+; Notes: filter_block1d16_v6 applies a 6 tap filter vertically to the input pixels. The
+; input pixel array has output_height rows. For every output row the six source rows
+; starting two rows above it are multiplied by the taps, summed with saturation,
+; rounded, shifted right by 7 and packed to 16 unsigned bytes.
+;*************************************************************************************/
+globalsym(vp8_filter_block1d16_v6_sse2)
+sym(vp8_filter_block1d16_v6_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 8
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rax, arg(7) ;vp8_filter
+ movsxd rdx, dword ptr arg(3) ;pixels_per_line
+
+ mov rdi, arg(1) ;output_ptr
+ mov rsi, arg(0) ;src_ptr
+
+ sub rsi, rdx ; step back two rows so [rsi] is the
+ sub rsi, rdx ; row used with the first tap
+
+ movsxd rcx, DWORD PTR arg(5) ;[output_height]
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(2) ; dst_pitch
+%endif
+
+.vp8_filter_block1d16_v6_sse2_loop:
+; The order for adding 6-tap is 2 5 3 1 4 6. Read in data in that order.
+ movdqa xmm1, XMMWORD PTR [rsi + rdx] ; line 2
+ movdqa xmm2, XMMWORD PTR [rsi + rdx + 16]
+ pmullw xmm1, [rax + 16]
+ pmullw xmm2, [rax + 16]
+
+ movdqa xmm3, XMMWORD PTR [rsi + rdx * 4] ; line 5
+ movdqa xmm4, XMMWORD PTR [rsi + rdx * 4 + 16]
+ pmullw xmm3, [rax + 64]
+ pmullw xmm4, [rax + 64]
+
+ movdqa xmm5, XMMWORD PTR [rsi + rdx * 2] ; line 3
+ movdqa xmm6, XMMWORD PTR [rsi + rdx * 2 + 16]
+ pmullw xmm5, [rax + 32]
+ pmullw xmm6, [rax + 32]
+
+ movdqa xmm7, XMMWORD PTR [rsi] ; line 1
+ movdqa xmm0, XMMWORD PTR [rsi + 16]
+ pmullw xmm7, [rax]
+ pmullw xmm0, [rax]
+
+ paddsw xmm1, xmm3
+ paddsw xmm2, xmm4
+ paddsw xmm1, xmm5
+ paddsw xmm2, xmm6
+ paddsw xmm1, xmm7
+ paddsw xmm2, xmm0
+
+ add rsi, rdx ; reach lines 4 and 6; also the per-row advance
+
+ movdqa xmm3, XMMWORD PTR [rsi + rdx * 2] ; line 4
+ movdqa xmm4, XMMWORD PTR [rsi + rdx * 2 + 16]
+ pmullw xmm3, [rax + 48]
+ pmullw xmm4, [rax + 48]
+
+ movdqa xmm5, XMMWORD PTR [rsi + rdx * 4] ; line 6
+ movdqa xmm6, XMMWORD PTR [rsi + rdx * 4 + 16]
+ pmullw xmm5, [rax + 80]
+ pmullw xmm6, [rax + 80]
+
+ ; xmm7 was used as scratch above, so the rounding constant is reloaded
+ ; on every iteration. xmm0 is not needed for the pack below (both
+ ; halves are packed together), so it is not cleared here.
+ movdqa xmm7, XMMWORD PTR [GLOBAL(rd)]
+
+ paddsw xmm1, xmm3
+ paddsw xmm2, xmm4
+ paddsw xmm1, xmm5
+ paddsw xmm2, xmm6
+
+ paddsw xmm1, xmm7
+ paddsw xmm2, xmm7
+
+ psraw xmm1, 7
+ psraw xmm2, 7
+
+ packuswb xmm1, xmm2 ; pack and saturate
+ movdqa XMMWORD PTR [rdi], xmm1 ; store the results in the destination
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(2) ;[dst_pitch]
+%else
+ add rdi, r8
+%endif
+ dec rcx ; decrement count
+ jnz .vp8_filter_block1d16_v6_sse2_loop ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_filter_block1d8_h6_only_sse2
+;(
+; unsigned char *src_ptr, ; bytes src_ptr[-2 .. 13] are loaded for every row
+; unsigned int src_pixels_per_line, ; added to src_ptr after every row
+; unsigned char *output_ptr, ; 8 bytes are stored per row
+; int dst_pitch, ; added to output_ptr after every row
+; unsigned int output_height, ; number of rows to produce (loop needs >= 1)
+; const short *vp8_filter ; six taps at 16-byte intervals, 8 words each
+;)
+; First-pass filter only when yoffset==0
+; Same horizontal 6-tap arithmetic as vp8_filter_block1d8_h6_sse2, but the clamped
+; result is stored directly as 8 bytes instead of 8 words.
+globalsym(vp8_filter_block1d8_h6_only_sse2)
+sym(vp8_filter_block1d8_h6_only_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rdx, arg(5) ;vp8_filter
+ mov rsi, arg(0) ;src_ptr
+
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rcx, dword ptr arg(4) ;output_height
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line ; Pitch for Source
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(3) ;dst_pitch
+%endif
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+
+.filter_block1d8_h6_only_rowloop:
+ movq xmm3, MMWORD PTR [rsi - 2] ; source bytes -2 .. 5
+ movq xmm1, MMWORD PTR [rsi + 6] ; source bytes 6 .. 13
+
+ prefetcht2 [rsi+rax-2] ; hint: start of the next source row
+
+ pslldq xmm1, 8
+ por xmm1, xmm3 ; xmm1 = 16 source bytes starting at src[-2]
+
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm1
+
+ movdqa xmm6, xmm1
+ movdqa xmm7, xmm1
+
+ punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
+ psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+ pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
+ punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+ psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+ pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
+
+
+ punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+ psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+ pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
+
+ punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+ psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+ pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
+
+ punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+ psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+
+ pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
+
+ punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+ pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
+
+
+ paddsw xmm4, xmm7 ; saturating sum of the six tap products
+ paddsw xmm4, xmm5
+
+ paddsw xmm4, xmm3
+ paddsw xmm4, xmm6
+
+ paddsw xmm4, xmm1
+ paddsw xmm4, [GLOBAL(rd)] ; + rounding constant
+
+ psraw xmm4, 7
+
+ packuswb xmm4, xmm0 ; clamp to 0..255, 8 result bytes in the low half
+
+ movq QWORD PTR [rdi], xmm4 ; store the results in the destination
+ lea rsi, [rsi + rax]
+
+%if ABI_IS_32BIT
+ add rdi, DWORD Ptr arg(3) ;dst_pitch
+%else
+ add rdi, r8
+%endif
+ dec rcx
+
+ jnz .filter_block1d8_h6_only_rowloop ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_filter_block1d16_h6_only_sse2
+;(
+; unsigned char *src_ptr, ; bytes src_ptr[-2 .. 18] are loaded for every row
+; unsigned int src_pixels_per_line, ; added to src_ptr after every row
+; unsigned char *output_ptr, ; 16 bytes are stored per row (two 8-byte stores)
+; int dst_pitch, ; added to output_ptr after every row
+; unsigned int output_height, ; number of rows to produce (loop needs >= 1)
+; const short *vp8_filter ; six taps at 16-byte intervals, 8 words each
+;)
+; First-pass filter only when yoffset==0
+; Applies the 6 tap filter horizontally and stores 16 clamped bytes per row, computed
+; as two halves of 8 pixels.
+globalsym(vp8_filter_block1d16_h6_only_sse2)
+sym(vp8_filter_block1d16_h6_only_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rdx, arg(5) ;vp8_filter
+ mov rsi, arg(0) ;src_ptr
+
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rcx, dword ptr arg(4) ;output_height
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line ; Pitch for Source
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(3) ;dst_pitch
+%endif
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+
+.filter_block1d16_h6_only_sse2_rowloop:
+ movq xmm3, MMWORD PTR [rsi - 2] ; source bytes -2 .. 5
+ movq xmm1, MMWORD PTR [rsi + 6] ; source bytes 6 .. 13
+
+ ; Load from 11 (not 14) so that nothing beyond source byte 18, the last
+ ; byte the filter consumes, is read; this matches
+ ; vp8_filter_block1d16_h6_sse2.
+ movq xmm2, MMWORD PTR [rsi +11]
+ ; The lower bits are not cleared before 'or'ing with xmm1,
+ ; but that is OK because the values in the overlapping positions
+ ; are already equal to the ones in xmm1.
+ pslldq xmm2, 5
+
+ por xmm2, xmm1 ; xmm2 = source bytes 6 .. 18 (top 3 bytes zero, unused)
+ prefetcht2 [rsi+rax-2] ; hint: start of the next source row
+
+ pslldq xmm1, 8
+ por xmm1, xmm3 ; xmm1 = source bytes -2 .. 13
+
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm1
+
+ movdqa xmm6, xmm1
+ movdqa xmm7, xmm1
+
+ punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
+ psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+ pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
+ punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+ psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+ pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
+
+ punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+ psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+ pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
+
+ punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+ psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+ pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
+
+ punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+ psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+ pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
+
+ punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+ pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
+
+ paddsw xmm4, xmm7 ; saturating sum of the six tap products
+ paddsw xmm4, xmm5
+
+ paddsw xmm4, xmm3
+ paddsw xmm4, xmm6
+
+ paddsw xmm4, xmm1
+ paddsw xmm4, [GLOBAL(rd)] ; + rounding constant
+
+ psraw xmm4, 7
+
+ packuswb xmm4, xmm0 ; lower 8 bytes
+
+ movq QWORD Ptr [rdi], xmm4 ; store the results in the destination
+
+ ; Second half (outputs 8 .. 15). xmm2 starts at source byte 6, so the byte
+ ; indices in the comments below are relative to pixel 8.
+ movdqa xmm3, xmm2
+ movdqa xmm4, xmm2
+
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm2
+
+ movdqa xmm7, xmm2
+
+ punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
+ psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+ pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
+ punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+ psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+ pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
+
+ punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+ psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+ pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
+
+ punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+ psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+ pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
+
+ punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+ psrldq xmm2, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+ pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
+
+ punpcklbw xmm2, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+ pmullw xmm2, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
+
+ paddsw xmm4, xmm7
+ paddsw xmm4, xmm5
+
+ paddsw xmm4, xmm3
+ paddsw xmm4, xmm6
+
+ paddsw xmm4, xmm2
+ paddsw xmm4, [GLOBAL(rd)]
+
+ psraw xmm4, 7
+
+ packuswb xmm4, xmm0 ; higher 8 bytes
+
+ movq QWORD Ptr [rdi+8], xmm4 ; store the results in the destination
+
+ lea rsi, [rsi + rax]
+%if ABI_IS_32BIT
+ add rdi, DWORD Ptr arg(3) ;dst_pitch
+%else
+ add rdi, r8
+%endif
+
+ dec rcx
+ jnz .filter_block1d16_h6_only_sse2_rowloop ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_filter_block1d8_v6_only_sse2
+;(
+; unsigned char *src_ptr, ; row used with the first tap (see note below)
+; unsigned int src_pixels_per_line, ; byte distance between source rows
+; unsigned char *output_ptr, ; 8 bytes are stored per row
+; int dst_pitch, ; added to output_ptr after every row
+; unsigned int output_height, ; number of rows to produce (loop needs >= 1)
+; const short *vp8_filter ; six taps at 16-byte intervals, 8 words each
+;)
+; Second-pass filter only when xoffset==0
+; Applies the 6 tap filter vertically to 8-pixel-wide rows of bytes. Unlike
+; vp8_filter_block1d8_v6_sse2, src_ptr is NOT moved back by two rows here.
+; NOTE(review): presumably the caller passes src_ptr already offset -- confirm.
+globalsym(vp8_filter_block1d8_v6_only_sse2)
+sym(vp8_filter_block1d8_v6_only_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rcx, dword ptr arg(4) ;output_height
+ movsxd rdx, dword ptr arg(1) ;src_pixels_per_line
+
+ mov rax, arg(5) ;vp8_filter
+
+ pxor xmm0, xmm0 ; clear xmm0
+
+ movdqa xmm7, XMMWORD PTR [GLOBAL(rd)] ; rounding constant, kept for the whole loop
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(3) ; dst_pitch
+%endif
+
+.vp8_filter_block1d8_v6_only_sse2_loop:
+ movq xmm1, MMWORD PTR [rsi] ; row 0
+ movq xmm2, MMWORD PTR [rsi + rdx] ; row 1
+ movq xmm3, MMWORD PTR [rsi + rdx * 2] ; row 2
+ movq xmm5, MMWORD PTR [rsi + rdx * 4] ; row 4
+ add rsi, rdx ; reach rows 3 and 5; also the per-row advance
+ movq xmm4, MMWORD PTR [rsi + rdx * 2] ; row 3
+ movq xmm6, MMWORD PTR [rsi + rdx * 4] ; row 5
+
+ punpcklbw xmm1, xmm0 ; widen bytes to words before multiplying
+ pmullw xmm1, [rax]
+
+ punpcklbw xmm2, xmm0
+ pmullw xmm2, [rax + 16]
+
+ punpcklbw xmm3, xmm0
+ pmullw xmm3, [rax + 32]
+
+ punpcklbw xmm5, xmm0
+ pmullw xmm5, [rax + 64]
+
+ punpcklbw xmm4, xmm0
+ pmullw xmm4, [rax + 48]
+
+ punpcklbw xmm6, xmm0
+ pmullw xmm6, [rax + 80]
+
+ paddsw xmm2, xmm5
+ paddsw xmm2, xmm3
+
+ paddsw xmm2, xmm1
+ paddsw xmm2, xmm4
+
+ paddsw xmm2, xmm6
+ paddsw xmm2, xmm7 ; + rounding constant
+
+ psraw xmm2, 7
+ packuswb xmm2, xmm0 ; pack and saturate
+
+ movq QWORD PTR [rdi], xmm2 ; store the results in the destination
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(3) ;[dst_pitch]
+%else
+ add rdi, r8
+%endif
+ dec rcx ; decrement count
+ jnz .vp8_filter_block1d8_v6_only_sse2_loop ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_unpack_block1d16_h6_sse2
+;(
+; unsigned char *src_ptr, ; 16 bytes are read per row
+; unsigned short *output_ptr, ; 16 words are stored per row with movdqa
+; unsigned int src_pixels_per_line, ; added to src_ptr after every row
+; unsigned int output_height, ; number of rows to convert (loop needs >= 1)
+; unsigned int output_width ; added (as a byte count) to output_ptr after every row
+;)
+; No filtering is done here: each row of 16 source bytes is zero-extended to
+; 16-bit words and stored. Only xmm0, xmm1 and xmm3 are used.
+globalsym(vp8_unpack_block1d16_h6_sse2)
+sym(vp8_unpack_block1d16_h6_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(1) ;output_ptr
+
+ movsxd rcx, dword ptr arg(3) ;output_height
+ movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(4) ;output_width ; stride of the output, in bytes
+%endif
+
+.unpack_block1d16_h6_sse2_rowloop:
+ movq xmm1, MMWORD PTR [rsi] ; source bytes 0 .. 7
+ movq xmm3, MMWORD PTR [rsi+8] ; source bytes 8 .. 15
+
+ punpcklbw xmm3, xmm0 ; zero-extend bytes 8 .. 15 to words
+ punpcklbw xmm1, xmm0 ; zero-extend bytes 0 .. 7 to words
+
+ movdqa XMMWORD Ptr [rdi], xmm1
+ movdqa XMMWORD Ptr [rdi + 16], xmm3
+
+ lea rsi, [rsi + rax]
+%if ABI_IS_32BIT
+ add rdi, DWORD Ptr arg(4) ;[output_width]
+%else
+ add rdi, r8
+%endif
+ dec rcx
+ jnz .unpack_block1d16_h6_sse2_rowloop ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+SECTION_RODATA
+align 16
+; Rounding constant: 0x40 = 64 in each of eight 16-bit words, added before the
+; arithmetic shift right by 7 in the filters of this file.
+rd:
+ times 8 dw 0x40
diff --git a/media/libvpx/libvpx/vp8/common/x86/subpixel_ssse3.asm b/media/libvpx/libvpx/vp8/common/x86/subpixel_ssse3.asm
new file mode 100644
index 0000000000..17247227db
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/x86/subpixel_ssse3.asm
@@ -0,0 +1,1515 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%define BLOCK_HEIGHT_WIDTH 4
+%define VP8_FILTER_WEIGHT 128
+%define VP8_FILTER_SHIFT 7
+
+SECTION .text
+
+;/************************************************************************************
+; Notes: vp8_filter_block1d8_h6_ssse3 applies a 6 tap filter horizontally to the input
+; pixels. Each loop iteration produces ONE row of 8 output bytes. When the first four
+; bytes of the selected k0_k5 table entry are zero, the routine jumps to a 4-tap path
+; that skips the k0_k5 multiply.
+; NOTE(review): the original notes said output_height is assumed to be even; the loops
+; below are plain dec/jnz loops that only need output_height >= 1 -- confirm with callers.
+;
+; This is an implementation of some of the SSE optimizations first seen in ffvp8
+;
+;*************************************************************************************/
+;void vp8_filter_block1d8_h6_ssse3
+;(
+; unsigned char *src_ptr, ; bytes src_ptr[-2 .. 10] are loaded for every row
+; unsigned int src_pixels_per_line, ; added to src_ptr after every row
+; unsigned char *output_ptr, ; 8 bytes are stored per row
+; unsigned int output_pitch, ; added to output_ptr after every row
+; unsigned int output_height, ; number of rows to produce
+; unsigned int vp8_filter_index ; scaled by 16 to index the k0_k5 table
+;)
+globalsym(vp8_filter_block1d8_h6_ssse3)
+sym(vp8_filter_block1d8_h6_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movsxd rdx, DWORD PTR arg(5) ;table index
+ xor rsi, rsi ; rsi = 0, compared against the table entry below
+ shl rdx, 4 ; 16 bytes per table entry
+
+ movdqa xmm7, [GLOBAL(rd)] ; rounding constant (used by both paths)
+
+ lea rax, [GLOBAL(k0_k5)]
+ add rax, rdx
+ mov rdi, arg(2) ;output_ptr
+
+ cmp esi, DWORD PTR [rax] ; first dword of the k0_k5 entry == 0 ?
+ je vp8_filter_block1d8_h4_ssse3 ; yes: use the 4-tap path
+
+ movdqa xmm4, XMMWORD PTR [rax] ;k0_k5
+ movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
+ movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line
+ movsxd rcx, dword ptr arg(4) ;output_height
+
+ movsxd rdx, dword ptr arg(3) ;output_pitch
+
+ sub rdi, rdx ; pre-bias: the loop advances rdi before it stores
+;xmm3 free
+.filter_block1d8_h6_rowloop_ssse3:
+ movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5
+
+ movq xmm2, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10
+
+ punpcklbw xmm0, xmm2 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10
+
+ movdqa xmm1, xmm0
+ pmaddubsw xmm0, xmm4 ; k0_k5 products, one word per pixel
+
+ movdqa xmm2, xmm1
+ pshufb xmm1, [GLOBAL(shuf2bfrom1)]
+
+ pshufb xmm2, [GLOBAL(shuf3bfrom1)]
+ pmaddubsw xmm1, xmm5 ; k2_k4 products
+
+ lea rdi, [rdi + rdx]
+ pmaddubsw xmm2, xmm6 ; k1_k3 products
+
+ lea rsi, [rsi + rax]
+ dec rcx ; sets ZF for the jnz below; the SSE ops in between leave flags alone
+
+ paddsw xmm0, xmm1
+ paddsw xmm2, xmm7 ; + rounding constant
+
+ paddsw xmm0, xmm2
+
+ psraw xmm0, 7
+
+ packuswb xmm0, xmm0 ; clamp to 0..255
+
+ movq MMWORD Ptr [rdi], xmm0
+ jnz .filter_block1d8_h6_rowloop_ssse3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+; 4-tap path: entered from above with rax = table entry, rdi = output_ptr and
+; xmm7 = rounding constant. Only the k2_k4 and k1_k3 products are summed.
+vp8_filter_block1d8_h4_ssse3:
+ movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
+ movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
+
+ movdqa xmm3, XMMWORD PTR [GLOBAL(shuf2bfrom1)]
+ movdqa xmm4, XMMWORD PTR [GLOBAL(shuf3bfrom1)]
+
+ mov rsi, arg(0) ;src_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line
+ movsxd rcx, dword ptr arg(4) ;output_height
+
+ movsxd rdx, dword ptr arg(3) ;output_pitch
+
+ sub rdi, rdx ; pre-bias: the loop advances rdi before it stores
+
+.filter_block1d8_h4_rowloop_ssse3:
+ movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5
+
+ movq xmm1, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10
+
+ punpcklbw xmm0, xmm1 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10
+
+ movdqa xmm2, xmm0
+ pshufb xmm0, xmm3
+
+ pshufb xmm2, xmm4
+ pmaddubsw xmm0, xmm5 ; k2_k4 products
+
+ lea rdi, [rdi + rdx]
+ pmaddubsw xmm2, xmm6 ; k1_k3 products
+
+ lea rsi, [rsi + rax]
+ dec rcx ; sets ZF for the jnz below
+
+ paddsw xmm0, xmm7 ; + rounding constant
+
+ paddsw xmm0, xmm2
+
+ psraw xmm0, 7
+
+ packuswb xmm0, xmm0 ; clamp to 0..255
+
+ movq MMWORD Ptr [rdi], xmm0
+
+ jnz .filter_block1d8_h4_rowloop_ssse3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+;void vp8_filter_block1d16_h6_ssse3
+;(
+; unsigned char *src_ptr, ; bytes src_ptr[-2 .. 18] are loaded for every row
+; unsigned int src_pixels_per_line, ; added to src_ptr after every row
+; unsigned char *output_ptr, ; 16 bytes are stored per row with movdqa
+; unsigned int output_pitch, ; added to output_ptr after every row
+; unsigned int output_height, ; number of rows to produce (loop needs >= 1)
+; unsigned int vp8_filter_index ; scaled by 16 to index the k0_k5 table
+;)
+; Applies the 6 tap filter horizontally, producing one row of 16 output bytes per
+; iteration as two halves of 8 pixels. This routine always runs the full 6-tap
+; computation; it has no 4-tap shortcut.
+globalsym(vp8_filter_block1d16_h6_ssse3)
+sym(vp8_filter_block1d16_h6_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movsxd rdx, DWORD PTR arg(5) ;table index
+ shl rdx, 4 ; 16 bytes per table entry
+
+ lea rax, [GLOBAL(k0_k5)]
+ add rax, rdx
+
+ mov rdi, arg(2) ;output_ptr
+
+ mov rsi, arg(0) ;src_ptr
+
+ movdqa xmm4, XMMWORD PTR [rax] ;k0_k5
+ movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
+ movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
+
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line
+ movsxd rcx, dword ptr arg(4) ;output_height
+ movsxd rdx, dword ptr arg(3) ;output_pitch
+
+.filter_block1d16_h6_rowloop_ssse3:
+ movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5
+
+ movq xmm3, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10
+
+ punpcklbw xmm0, xmm3 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10
+
+ movdqa xmm1, xmm0
+ pmaddubsw xmm0, xmm4 ; k0_k5 products, pixels 0 .. 7
+
+ movdqa xmm2, xmm1
+ pshufb xmm1, [GLOBAL(shuf2bfrom1)]
+
+ pshufb xmm2, [GLOBAL(shuf3bfrom1)]
+ movq xmm3, MMWORD PTR [rsi + 6] ; 6 .. 13: second half, same layout shifted by 8
+
+ pmaddubsw xmm1, xmm5 ; k2_k4 products
+ movq xmm7, MMWORD PTR [rsi + 11] ; 11 .. 18
+
+ pmaddubsw xmm2, xmm6 ; k1_k3 products
+ punpcklbw xmm3, xmm7 ; 6 11 7 12 8 13 ... 13 18
+
+ paddsw xmm0, xmm1
+ movdqa xmm1, xmm3
+
+ pmaddubsw xmm3, xmm4 ; k0_k5 products, pixels 8 .. 15
+ paddsw xmm0, xmm2
+
+ movdqa xmm2, xmm1
+ paddsw xmm0, [GLOBAL(rd)] ; + rounding constant (xmm7 is scratch here)
+
+ pshufb xmm1, [GLOBAL(shuf2bfrom1)]
+ pshufb xmm2, [GLOBAL(shuf3bfrom1)]
+
+ psraw xmm0, 7
+ pmaddubsw xmm1, xmm5
+
+ pmaddubsw xmm2, xmm6
+ packuswb xmm0, xmm0 ; pixels 0 .. 7 clamped to bytes
+
+ lea rsi, [rsi + rax]
+ paddsw xmm3, xmm1
+
+ paddsw xmm3, xmm2
+
+ paddsw xmm3, [GLOBAL(rd)]
+
+ psraw xmm3, 7
+
+ packuswb xmm3, xmm3 ; pixels 8 .. 15 clamped to bytes
+
+ punpcklqdq xmm0, xmm3 ; low qword = pixels 0..7, high qword = pixels 8..15
+
+ movdqa XMMWORD Ptr [rdi], xmm0
+
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .filter_block1d16_h6_rowloop_ssse3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_filter_block1d4_h6_ssse3
+;(
+; unsigned char *src_ptr, ; 16 bytes starting at src_ptr[-2] are loaded per row
+; unsigned int src_pixels_per_line, ; added to src_ptr after every row
+; unsigned char *output_ptr, ; 4 bytes are stored per row
+; unsigned int output_pitch, ; added to output_ptr after every row
+; unsigned int output_height, ; number of rows to produce (loops need >= 1)
+; unsigned int vp8_filter_index ; scaled by 16 to index the k0_k5 table
+;)
+; Applies the 6 tap filter horizontally, producing one row of 4 output bytes per
+; iteration. When the first four bytes of the selected k0_k5 table entry are zero,
+; a 4-tap path that skips the k0_k5 multiply is used instead.
+globalsym(vp8_filter_block1d4_h6_ssse3)
+sym(vp8_filter_block1d4_h6_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movsxd rdx, DWORD PTR arg(5) ;table index
+ xor rsi, rsi ; rsi = 0, compared against the table entry below
+ shl rdx, 4 ; 16 bytes per table entry
+
+ lea rax, [GLOBAL(k0_k5)]
+ add rax, rdx
+ movdqa xmm7, [GLOBAL(rd)] ; rounding constant (used by both paths)
+
+ cmp esi, DWORD PTR [rax] ; first dword of the k0_k5 entry == 0 ?
+ je .vp8_filter_block1d4_h4_ssse3 ; yes: use the 4-tap path
+
+ movdqa xmm4, XMMWORD PTR [rax] ;k0_k5
+ movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
+ movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line
+ movsxd rcx, dword ptr arg(4) ;output_height
+
+ movsxd rdx, dword ptr arg(3) ;output_pitch
+
+;xmm3 free
+.filter_block1d4_h6_rowloop_ssse3:
+ movdqu xmm0, XMMWORD PTR [rsi - 2] ; unaligned load of 16 source bytes
+
+ movdqa xmm1, xmm0
+ pshufb xmm0, [GLOBAL(shuf1b)]
+
+ movdqa xmm2, xmm1
+ pshufb xmm1, [GLOBAL(shuf2b)]
+ pmaddubsw xmm0, xmm4 ; k0_k5 products
+ pshufb xmm2, [GLOBAL(shuf3b)]
+ pmaddubsw xmm1, xmm5 ; k2_k4 products
+
+;--
+ pmaddubsw xmm2, xmm6 ; k1_k3 products
+
+ lea rsi, [rsi + rax]
+;--
+ paddsw xmm0, xmm1
+ paddsw xmm0, xmm7 ; + rounding constant
+ paddsw xmm0, xmm2
+ psraw xmm0, 7
+ packuswb xmm0, xmm0 ; clamp to 0..255
+
+ movd DWORD PTR [rdi], xmm0 ; store 4 output bytes
+
+ add rdi, rdx
+ dec rcx
+ jnz .filter_block1d4_h6_rowloop_ssse3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+; 4-tap path: entered with rax = table entry and xmm7 = rounding constant.
+; Only the k2_k4 and k1_k3 products are summed.
+.vp8_filter_block1d4_h4_ssse3:
+ movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
+ movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
+ movdqa xmm0, XMMWORD PTR [GLOBAL(shuf2b)]
+ movdqa xmm3, XMMWORD PTR [GLOBAL(shuf3b)]
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line
+ movsxd rcx, dword ptr arg(4) ;output_height
+
+ movsxd rdx, dword ptr arg(3) ;output_pitch
+
+.filter_block1d4_h4_rowloop_ssse3:
+ movdqu xmm1, XMMWORD PTR [rsi - 2] ; unaligned load of 16 source bytes
+
+ movdqa xmm2, xmm1
+ pshufb xmm1, xmm0 ;;[GLOBAL(shuf2b)]
+ pshufb xmm2, xmm3 ;;[GLOBAL(shuf3b)]
+ pmaddubsw xmm1, xmm5 ; k2_k4 products
+
+;--
+ pmaddubsw xmm2, xmm6 ; k1_k3 products
+
+ lea rsi, [rsi + rax]
+;--
+ paddsw xmm1, xmm7 ; + rounding constant
+ paddsw xmm1, xmm2
+ psraw xmm1, 7
+ packuswb xmm1, xmm1 ; clamp to 0..255
+
+ movd DWORD PTR [rdi], xmm1 ; store 4 output bytes
+
+ add rdi, rdx
+ dec rcx
+ jnz .filter_block1d4_h4_rowloop_ssse3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+
+;void vp8_filter_block1d16_v6_ssse3
+;(
+; unsigned char *src_ptr, ; arg(0): row A of the six-row window A..F
+; unsigned int src_pitch, ; arg(1): byte distance between source rows
+; unsigned char *output_ptr, ; arg(2): destination, 16 bytes written per row
+; unsigned int out_pitch, ; arg(3): byte distance between output rows
+; unsigned int output_height, ; arg(4): number of output rows (loop count)
+; unsigned int vp8_filter_index ; arg(5): entry selector for k0_k5/k1_k3/k2_k4
+;)
+globalsym(vp8_filter_block1d16_v6_ssse3)
+sym(vp8_filter_block1d16_v6_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movsxd rdx, DWORD PTR arg(5) ;table index
+ xor rsi, rsi ;esi = 0 for the zero test below
+ shl rdx, 4 ;each table entry is 16 bytes
+
+ lea rax, [GLOBAL(k0_k5)]
+ add rax, rdx ;rax = address of the selected k0_k5 entry
+
+ cmp esi, DWORD PTR [rax] ;first 4 bytes of the k0_k5 entry == 0?
+ je .vp8_filter_block1d16_v4_ssse3 ;yes: rows A/F are not needed, use the 4-tap loop
+
+ movdqa xmm5, XMMWORD PTR [rax] ;k0_k5
+ movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
+ movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
+ mov rdi, arg(2) ;output_ptr
+
+%if ABI_IS_32BIT=0
+ movsxd r8, DWORD PTR arg(3) ;out_pitch
+%endif
+ mov rax, rsi
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+ add rax, rdx ;rax = src_ptr + pitch, so [rax + rdx*2] / [rax + rdx*4] reach rows D / F
+
+
+.vp8_filter_block1d16_v6_ssse3_loop:
+ movq xmm1, MMWORD PTR [rsi] ;A
+ movq xmm2, MMWORD PTR [rsi + rdx] ;B
+ movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
+ movq xmm4, MMWORD PTR [rax + rdx * 2] ;D
+ movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E
+
+ punpcklbw xmm2, xmm4 ;B D
+ punpcklbw xmm3, xmm0 ;C E
+
+ movq xmm0, MMWORD PTR [rax + rdx * 4] ;F
+
+ pmaddubsw xmm3, xmm6 ;C*k2 + E*k4
+ punpcklbw xmm1, xmm0 ;A F
+ pmaddubsw xmm2, xmm7 ;B*k1 + D*k3
+ pmaddubsw xmm1, xmm5 ;A*k0 + F*k5
+
+ paddsw xmm2, xmm3
+ paddsw xmm2, xmm1
+ paddsw xmm2, [GLOBAL(rd)] ;add the 0x40 rounding term
+ psraw xmm2, 7 ;divide by 128
+ packuswb xmm2, xmm2 ;saturate to unsigned bytes
+
+ movq MMWORD PTR [rdi], xmm2 ;store the results (columns 0-7)
+
+ movq xmm1, MMWORD PTR [rsi + 8] ;A, columns 8-15 from here on
+ movq xmm2, MMWORD PTR [rsi + rdx + 8] ;B
+ movq xmm3, MMWORD PTR [rsi + rdx * 2 + 8] ;C
+ movq xmm4, MMWORD PTR [rax + rdx * 2 + 8] ;D
+ movq xmm0, MMWORD PTR [rsi + rdx * 4 + 8] ;E
+
+ punpcklbw xmm2, xmm4 ;B D
+ punpcklbw xmm3, xmm0 ;C E
+
+ movq xmm0, MMWORD PTR [rax + rdx * 4 + 8] ;F
+ pmaddubsw xmm3, xmm6
+ punpcklbw xmm1, xmm0 ;A F
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm5
+
+ add rsi, rdx ;advance both row bases by one source row
+ add rax, rdx
+;--
+;--
+ paddsw xmm2, xmm3
+ paddsw xmm2, xmm1
+ paddsw xmm2, [GLOBAL(rd)]
+ psraw xmm2, 7
+ packuswb xmm2, xmm2
+
+ movq MMWORD PTR [rdi+8], xmm2 ;store columns 8-15
+
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(3) ;out_pitch
+%else
+ add rdi, r8
+%endif
+ dec rcx
+ jnz .vp8_filter_block1d16_v6_ssse3_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+.vp8_filter_block1d16_v4_ssse3:
+ movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
+ movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
+ mov rdi, arg(2) ;output_ptr
+
+%if ABI_IS_32BIT=0
+ movsxd r8, DWORD PTR arg(3) ;out_pitch
+%endif
+ mov rax, rsi
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+ add rax, rdx ;rax = src_ptr + pitch (base for row D)
+
+.vp8_filter_block1d16_v4_ssse3_loop:
+ movq xmm2, MMWORD PTR [rsi + rdx] ;B
+ movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
+ movq xmm4, MMWORD PTR [rax + rdx * 2] ;D
+ movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E
+
+ punpcklbw xmm2, xmm4 ;B D
+ punpcklbw xmm3, xmm0 ;C E
+
+ pmaddubsw xmm3, xmm6 ;C*k2 + E*k4
+ pmaddubsw xmm2, xmm7 ;B*k1 + D*k3
+ movq xmm5, MMWORD PTR [rsi + rdx + 8] ;B, columns 8-15 (xmm5 is free: k0_k5 is not loaded on this path)
+ movq xmm1, MMWORD PTR [rsi + rdx * 2 + 8] ;C
+ movq xmm4, MMWORD PTR [rax + rdx * 2 + 8] ;D
+ movq xmm0, MMWORD PTR [rsi + rdx * 4 + 8] ;E
+
+ paddsw xmm2, [GLOBAL(rd)]
+ paddsw xmm2, xmm3
+ psraw xmm2, 7
+ packuswb xmm2, xmm2
+
+ punpcklbw xmm5, xmm4 ;B D
+ punpcklbw xmm1, xmm0 ;C E
+
+ pmaddubsw xmm1, xmm6
+ pmaddubsw xmm5, xmm7
+
+ movdqa xmm4, [GLOBAL(rd)] ;rounding constant
+ add rsi, rdx
+ add rax, rdx
+;--
+;--
+ paddsw xmm5, xmm1
+ paddsw xmm5, xmm4
+ psraw xmm5, 7
+ packuswb xmm5, xmm5
+
+ punpcklqdq xmm2, xmm5 ;low qword = columns 0-7, high qword = columns 8-15
+
+ movdqa XMMWORD PTR [rdi], xmm2 ;aligned store: the output row must be 16-byte aligned on this path
+
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(3) ;out_pitch
+%else
+ add rdi, r8
+%endif
+ dec rcx
+ jnz .vp8_filter_block1d16_v4_ssse3_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_filter_block1d8_v6_ssse3
+;(
+; unsigned char *src_ptr, ; arg(0): row A of the six-row window A..F
+; unsigned int src_pitch, ; arg(1): byte distance between source rows
+; unsigned char *output_ptr, ; arg(2): destination, 8 bytes written per row
+; unsigned int out_pitch, ; arg(3): byte distance between output rows
+; unsigned int output_height, ; arg(4): number of output rows (loop count)
+; unsigned int vp8_filter_index ; arg(5): entry selector for k0_k5/k1_k3/k2_k4
+;)
+globalsym(vp8_filter_block1d8_v6_ssse3)
+sym(vp8_filter_block1d8_v6_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movsxd rdx, DWORD PTR arg(5) ;table index
+ xor rsi, rsi ;esi = 0 for the zero test below
+ shl rdx, 4 ;each table entry is 16 bytes
+
+ lea rax, [GLOBAL(k0_k5)]
+ add rax, rdx ;rax = address of the selected k0_k5 entry
+
+ movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
+ mov rdi, arg(2) ;output_ptr
+%if ABI_IS_32BIT=0
+ movsxd r8, DWORD PTR arg(3) ; out_pitch
+%endif
+ movsxd rcx, DWORD PTR arg(4) ;[output_height]
+
+ cmp esi, DWORD PTR [rax] ;first 4 bytes of the k0_k5 entry == 0?
+ je .vp8_filter_block1d8_v4_ssse3 ;yes: rows A/F are not needed, use the 4-tap loop
+
+ movdqa xmm5, XMMWORD PTR [rax] ;k0_k5
+ movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
+ movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3
+
+ mov rsi, arg(0) ;src_ptr
+
+ mov rax, rsi
+ add rax, rdx ;rax = src_ptr + pitch (base for rows D and F)
+
+.vp8_filter_block1d8_v6_ssse3_loop:
+ movq xmm1, MMWORD PTR [rsi] ;A
+ movq xmm2, MMWORD PTR [rsi + rdx] ;B
+ movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
+ movq xmm4, MMWORD PTR [rax + rdx * 2] ;D
+ movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E
+
+ punpcklbw xmm2, xmm4 ;B D
+ punpcklbw xmm3, xmm0 ;C E
+
+ movq xmm0, MMWORD PTR [rax + rdx * 4] ;F
+ movdqa xmm4, [GLOBAL(rd)] ;rounding constant (xmm4 is free after the B/D interleave)
+
+ pmaddubsw xmm3, xmm6 ;C*k2 + E*k4
+ punpcklbw xmm1, xmm0 ;A F
+ pmaddubsw xmm2, xmm7 ;B*k1 + D*k3
+ pmaddubsw xmm1, xmm5 ;A*k0 + F*k5
+ add rsi, rdx ;advance both row bases by one source row
+ add rax, rdx
+;--
+;--
+ paddsw xmm2, xmm3
+ paddsw xmm2, xmm1
+ paddsw xmm2, xmm4
+ psraw xmm2, 7 ;divide by 128
+ packuswb xmm2, xmm2 ;saturate to unsigned bytes
+
+ movq MMWORD PTR [rdi], xmm2 ;store 8 output pixels
+
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(3) ;[out_pitch]
+%else
+ add rdi, r8
+%endif
+ dec rcx
+ jnz .vp8_filter_block1d8_v6_ssse3_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+.vp8_filter_block1d8_v4_ssse3:
+ movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
+ movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3
+ movdqa xmm5, [GLOBAL(rd)] ;rounding constant, kept in xmm5 (k0_k5 is not loaded on this path)
+
+ mov rsi, arg(0) ;src_ptr
+
+ mov rax, rsi
+ add rax, rdx ;rax = src_ptr + pitch (base for row D)
+
+.vp8_filter_block1d8_v4_ssse3_loop:
+ movq xmm2, MMWORD PTR [rsi + rdx] ;B
+ movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
+ movq xmm4, MMWORD PTR [rax + rdx * 2] ;D
+ movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E
+
+ punpcklbw xmm2, xmm4 ;B D
+ punpcklbw xmm3, xmm0 ;C E
+
+ pmaddubsw xmm3, xmm6 ;C*k2 + E*k4
+ pmaddubsw xmm2, xmm7 ;B*k1 + D*k3
+ add rsi, rdx
+ add rax, rdx
+;--
+;--
+ paddsw xmm2, xmm3
+ paddsw xmm2, xmm5
+ psraw xmm2, 7
+ packuswb xmm2, xmm2
+
+ movq MMWORD PTR [rdi], xmm2 ;store 8 output pixels
+
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(3) ;[out_pitch]
+%else
+ add rdi, r8
+%endif
+ dec rcx
+ jnz .vp8_filter_block1d8_v4_ssse3_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+;void vp8_filter_block1d4_v6_ssse3
+;(
+; unsigned char *src_ptr, ; arg(0): row A of the six-row window A..F
+; unsigned int src_pitch, ; arg(1): byte distance between source rows
+; unsigned char *output_ptr, ; arg(2): destination, 4 bytes written per row
+; unsigned int out_pitch, ; arg(3): byte distance between output rows
+; unsigned int output_height, ; arg(4): number of output rows (loop count)
+; unsigned int vp8_filter_index ; arg(5): entry selector for k0_k5/k1_k3/k2_k4
+;)
+globalsym(vp8_filter_block1d4_v6_ssse3)
+sym(vp8_filter_block1d4_v6_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog (no SAVE_XMM: this routine only touches mm0-mm7)
+
+ movsxd rdx, DWORD PTR arg(5) ;table index
+ xor rsi, rsi ;esi = 0 for the zero test below
+ shl rdx, 4 ;each table entry is 16 bytes
+
+ lea rax, [GLOBAL(k0_k5)]
+ add rax, rdx ;rax = address of the selected k0_k5 entry
+
+ movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
+ mov rdi, arg(2) ;output_ptr
+%if ABI_IS_32BIT=0
+ movsxd r8, DWORD PTR arg(3) ; out_pitch
+%endif
+ movsxd rcx, DWORD PTR arg(4) ;[output_height]
+
+ cmp esi, DWORD PTR [rax] ;first 4 bytes of the k0_k5 entry == 0?
+ je .vp8_filter_block1d4_v4_ssse3 ;yes: rows A/F are not needed, use the 4-tap loop
+
+ movq mm5, MMWORD PTR [rax] ;k0_k5 (only the first 8 bytes of the 16-byte entry are loaded)
+ movq mm6, MMWORD PTR [rax+256] ;k2_k4
+ movq mm7, MMWORD PTR [rax+128] ;k1_k3
+
+ mov rsi, arg(0) ;src_ptr
+
+ mov rax, rsi
+ add rax, rdx ;rax = src_ptr + pitch (base for rows D and F)
+
+.vp8_filter_block1d4_v6_ssse3_loop:
+ movd mm1, DWORD PTR [rsi] ;A
+ movd mm2, DWORD PTR [rsi + rdx] ;B
+ movd mm3, DWORD PTR [rsi + rdx * 2] ;C
+ movd mm4, DWORD PTR [rax + rdx * 2] ;D
+ movd mm0, DWORD PTR [rsi + rdx * 4] ;E
+
+ punpcklbw mm2, mm4 ;B D
+ punpcklbw mm3, mm0 ;C E
+
+ movd mm0, DWORD PTR [rax + rdx * 4] ;F
+
+ movq mm4, [GLOBAL(rd)] ;rounding constant (first 4 words of rd)
+
+ pmaddubsw mm3, mm6 ;C*k2 + E*k4
+ punpcklbw mm1, mm0 ;A F
+ pmaddubsw mm2, mm7 ;B*k1 + D*k3
+ pmaddubsw mm1, mm5 ;A*k0 + F*k5
+ add rsi, rdx ;advance both row bases by one source row
+ add rax, rdx
+;--
+;--
+ paddsw mm2, mm3
+ paddsw mm2, mm1
+ paddsw mm2, mm4
+ psraw mm2, 7 ;divide by 128
+ packuswb mm2, mm2 ;saturate to unsigned bytes
+
+ movd DWORD PTR [rdi], mm2 ;store 4 output pixels
+
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(3) ;[out_pitch]
+%else
+ add rdi, r8
+%endif
+ dec rcx
+ jnz .vp8_filter_block1d4_v6_ssse3_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret ;NOTE(review): no emms is issued here; presumably callers clear the MMX state -- confirm
+
+.vp8_filter_block1d4_v4_ssse3:
+ movq mm6, MMWORD PTR [rax+256] ;k2_k4
+ movq mm7, MMWORD PTR [rax+128] ;k1_k3
+ movq mm5, MMWORD PTR [GLOBAL(rd)] ;rounding constant, kept in mm5 (k0_k5 is not loaded on this path)
+
+ mov rsi, arg(0) ;src_ptr
+
+ mov rax, rsi
+ add rax, rdx ;rax = src_ptr + pitch (base for row D)
+
+.vp8_filter_block1d4_v4_ssse3_loop:
+ movd mm2, DWORD PTR [rsi + rdx] ;B
+ movd mm3, DWORD PTR [rsi + rdx * 2] ;C
+ movd mm4, DWORD PTR [rax + rdx * 2] ;D
+ movd mm0, DWORD PTR [rsi + rdx * 4] ;E
+
+ punpcklbw mm2, mm4 ;B D
+ punpcklbw mm3, mm0 ;C E
+
+ pmaddubsw mm3, mm6 ;C*k2 + E*k4
+ pmaddubsw mm2, mm7 ;B*k1 + D*k3
+ add rsi, rdx
+ add rax, rdx
+;--
+;--
+ paddsw mm2, mm3
+ paddsw mm2, mm5
+ psraw mm2, 7
+ packuswb mm2, mm2
+
+ movd DWORD PTR [rdi], mm2 ;store 4 output pixels
+
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(3) ;[out_pitch]
+%else
+ add rdi, r8
+%endif
+ dec rcx
+ jnz .vp8_filter_block1d4_v4_ssse3_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret ;NOTE(review): no emms on this path either -- confirm callers handle it
+
+;void vp8_bilinear_predict16x16_ssse3
+;(
+; unsigned char *src_ptr, ; arg(0)
+; int src_pixels_per_line, ; arg(1): byte distance between source rows
+; int xoffset, ; arg(2): horizontal filter index; 0 skips the first pass
+; int yoffset, ; arg(3): vertical filter index; 0 skips the second pass
+; unsigned char *dst_ptr, ; arg(4): written with movdqa, so rows must be 16-byte aligned
+; int dst_pitch ; arg(5): byte distance between output rows
+;)
+globalsym(vp8_bilinear_predict16x16_ssse3)
+sym(vp8_bilinear_predict16x16_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ lea rcx, [GLOBAL(vp8_bilinear_filters_ssse3)]
+ movsxd rax, dword ptr arg(2) ; xoffset
+
+ cmp rax, 0 ; skip first_pass filter if xoffset=0
+ je .b16x16_sp_only
+
+ shl rax, 4 ; 16 bytes per filter entry
+ lea rax, [rax + rcx] ; HFilter
+
+ mov rdi, arg(4) ; dst_ptr
+ mov rsi, arg(0) ; src_ptr
+ movsxd rdx, dword ptr arg(5) ; dst_pitch
+
+ movdqa xmm1, [rax] ; xmm1 = HFilter byte pairs
+
+ movsxd rax, dword ptr arg(3) ; yoffset
+
+ cmp rax, 0 ; skip second_pass filter if yoffset=0
+ je .b16x16_fp_only
+
+ shl rax, 4 ; 16 bytes per filter entry
+ lea rax, [rax + rcx] ; VFilter
+
+ lea rcx, [rdi+rdx*8]
+ lea rcx, [rcx+rdx*8] ; rcx = dst_ptr + 16*dst_pitch: loop end marker
+ movsxd rdx, dword ptr arg(1) ; src_pixels_per_line
+
+ movdqa xmm2, [rax] ; xmm2 = VFilter byte pairs
+
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(5) ; dst_pitch
+%endif
+ movq xmm3, [rsi] ; 00 01 02 03 04 05 06 07
+ movq xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08
+
+ punpcklbw xmm3, xmm5 ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
+ movq xmm4, [rsi+8] ; 08 09 10 11 12 13 14 15
+
+ movq xmm5, [rsi+9] ; 09 10 11 12 13 14 15 16
+
+ lea rsi, [rsi + rdx] ; next line
+
+ pmaddubsw xmm3, xmm1 ; 00 02 04 06 08 10 12 14
+
+ punpcklbw xmm4, xmm5 ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16
+ pmaddubsw xmm4, xmm1 ; 01 03 05 07 09 11 13 15
+
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ paddw xmm4, [GLOBAL(rd)] ; xmm4 += round value
+ psraw xmm4, VP8_FILTER_SHIFT ; xmm4 /= 128
+
+ movdqa xmm7, xmm3
+ packuswb xmm7, xmm4 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+
+.next_row:
+ movq xmm6, [rsi] ; 00 01 02 03 04 05 06 07
+ movq xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08
+
+ punpcklbw xmm6, xmm5
+ movq xmm4, [rsi+8] ; 08 09 10 11 12 13 14 15
+
+ movq xmm5, [rsi+9] ; 09 10 11 12 13 14 15 16
+ lea rsi, [rsi + rdx] ; next line
+
+ pmaddubsw xmm6, xmm1
+
+ punpcklbw xmm4, xmm5
+ pmaddubsw xmm4, xmm1
+
+ paddw xmm6, [GLOBAL(rd)] ; xmm6 += round value
+ psraw xmm6, VP8_FILTER_SHIFT ; xmm6 /= 128
+
+ paddw xmm4, [GLOBAL(rd)] ; xmm4 += round value
+ psraw xmm4, VP8_FILTER_SHIFT ; xmm4 /= 128
+
+ packuswb xmm6, xmm4 ; xmm6 = current first-pass row, 16 bytes
+ movdqa xmm5, xmm7 ; xmm7 = previous first-pass row
+
+ punpcklbw xmm5, xmm6 ; interleave previous/current rows, columns 0-7
+ pmaddubsw xmm5, xmm2
+
+ punpckhbw xmm7, xmm6 ; interleave previous/current rows, columns 8-15
+ pmaddubsw xmm7, xmm2
+
+ paddw xmm5, [GLOBAL(rd)] ; xmm5 += round value
+ psraw xmm5, VP8_FILTER_SHIFT ; xmm5 /= 128
+
+ paddw xmm7, [GLOBAL(rd)] ; xmm7 += round value
+ psraw xmm7, VP8_FILTER_SHIFT ; xmm7 /= 128
+
+ packuswb xmm5, xmm7
+ movdqa xmm7, xmm6 ; current row becomes the previous row
+
+ movdqa [rdi], xmm5 ; store the results in the destination
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(5) ; dst_pitch
+%else
+ add rdi, r8
+%endif
+
+ cmp rdi, rcx ; all 16 output rows written?
+ jne .next_row
+
+ jmp .done
+
+.b16x16_sp_only:
+ movsxd rax, dword ptr arg(3) ; yoffset
+ shl rax, 4 ; 16 bytes per filter entry
+ lea rax, [rax + rcx] ; VFilter
+
+ mov rdi, arg(4) ; dst_ptr
+ mov rsi, arg(0) ; src_ptr
+ movsxd rdx, dword ptr arg(5) ; dst_pitch
+
+ movdqa xmm1, [rax] ; VFilter
+
+ lea rcx, [rdi+rdx*8]
+ lea rcx, [rcx+rdx*8] ; rcx = dst_ptr + 16*dst_pitch: loop end marker
+ movsxd rax, dword ptr arg(1) ; src_pixels_per_line
+
+ ; get the first horizontal line done
+ movq xmm4, [rsi] ; load row 0
+ movq xmm2, [rsi + 8] ; load row 0
+
+ lea rsi, [rsi + rax] ; next line
+.next_row_sp: ; two output rows per iteration
+ movq xmm3, [rsi] ; load row + 1
+ movq xmm5, [rsi + 8] ; load row + 1
+
+ punpcklbw xmm4, xmm3
+ punpcklbw xmm2, xmm5
+
+ pmaddubsw xmm4, xmm1
+ movq xmm7, [rsi + rax] ; load row + 2
+
+ pmaddubsw xmm2, xmm1
+ movq xmm6, [rsi + rax + 8] ; load row + 2
+
+ punpcklbw xmm3, xmm7
+ punpcklbw xmm5, xmm6
+
+ pmaddubsw xmm3, xmm1
+ paddw xmm4, [GLOBAL(rd)]
+
+ pmaddubsw xmm5, xmm1
+ paddw xmm2, [GLOBAL(rd)]
+
+ psraw xmm4, VP8_FILTER_SHIFT
+ psraw xmm2, VP8_FILTER_SHIFT
+
+ packuswb xmm4, xmm2
+ paddw xmm3, [GLOBAL(rd)]
+
+ movdqa [rdi], xmm4 ; store row 0
+ paddw xmm5, [GLOBAL(rd)]
+
+ psraw xmm3, VP8_FILTER_SHIFT
+ psraw xmm5, VP8_FILTER_SHIFT
+
+ packuswb xmm3, xmm5
+ movdqa xmm4, xmm7 ; row + 2 (columns 0-7) is row 0 of the next iteration
+
+ movdqa [rdi + rdx],xmm3 ; store row 1
+ lea rsi, [rsi + 2*rax] ; advance two source rows
+
+ movdqa xmm2, xmm6 ; row + 2 (columns 8-15) is row 0 of the next iteration
+ lea rdi, [rdi + 2*rdx] ; advance two output rows
+
+ cmp rdi, rcx
+ jne .next_row_sp
+
+ jmp .done
+
+.b16x16_fp_only:
+ lea rcx, [rdi+rdx*8]
+ lea rcx, [rcx+rdx*8] ; rcx = dst_ptr + 16*dst_pitch: loop end marker
+ movsxd rax, dword ptr arg(1) ; src_pixels_per_line
+
+.next_row_fp: ; two output rows per iteration
+ movq xmm2, [rsi] ; 00 01 02 03 04 05 06 07
+ movq xmm4, [rsi+1] ; 01 02 03 04 05 06 07 08
+
+ punpcklbw xmm2, xmm4
+ movq xmm3, [rsi+8] ; 08 09 10 11 12 13 14 15
+
+ pmaddubsw xmm2, xmm1
+ movq xmm4, [rsi+9] ; 09 10 11 12 13 14 15 16
+
+ lea rsi, [rsi + rax] ; next line
+ punpcklbw xmm3, xmm4
+
+ pmaddubsw xmm3, xmm1
+ movq xmm5, [rsi] ; second row of this iteration starts loading here
+
+ paddw xmm2, [GLOBAL(rd)]
+ movq xmm7, [rsi+1]
+
+ movq xmm6, [rsi+8]
+ psraw xmm2, VP8_FILTER_SHIFT
+
+ punpcklbw xmm5, xmm7
+ movq xmm7, [rsi+9]
+
+ paddw xmm3, [GLOBAL(rd)]
+ pmaddubsw xmm5, xmm1
+
+ psraw xmm3, VP8_FILTER_SHIFT
+ punpcklbw xmm6, xmm7
+
+ packuswb xmm2, xmm3
+ pmaddubsw xmm6, xmm1
+
+ movdqa [rdi], xmm2 ; store the results in the destination
+ paddw xmm5, [GLOBAL(rd)]
+
+ lea rdi, [rdi + rdx] ; dst_pitch
+ psraw xmm5, VP8_FILTER_SHIFT
+
+ paddw xmm6, [GLOBAL(rd)]
+ psraw xmm6, VP8_FILTER_SHIFT
+
+ packuswb xmm5, xmm6
+ lea rsi, [rsi + rax] ; next line
+
+ movdqa [rdi], xmm5 ; store the results in the destination
+ lea rdi, [rdi + rdx] ; dst_pitch
+
+ cmp rdi, rcx
+
+ jne .next_row_fp
+
+.done:
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_bilinear_predict8x8_ssse3
+;(
+; unsigned char *src_ptr, ; arg(0): 9 rows of 16 bytes are read from here
+; int src_pixels_per_line, ; arg(1): byte distance between source rows
+; int xoffset, ; arg(2): horizontal filter index; 0 skips the first pass
+; int yoffset, ; arg(3): vertical filter index; 0 skips the second pass
+; unsigned char *dst_ptr, ; arg(4): 8 bytes written per row
+; int dst_pitch ; arg(5): byte distance between output rows
+;)
+globalsym(vp8_bilinear_predict8x8_ssse3)
+sym(vp8_bilinear_predict8x8_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 144 ; reserve 144 bytes (9 rows x 16 bytes)
+
+ lea rcx, [GLOBAL(vp8_bilinear_filters_ssse3)]
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rdx, dword ptr arg(1) ;src_pixels_per_line
+
+ ;Read 9-line unaligned data in and put them on stack. This gives a big
+ ;performance boost.
+ movdqu xmm0, [rsi]
+ lea rax, [rdx + rdx*2] ; rax = 3 * src_pixels_per_line
+ movdqu xmm1, [rsi+rdx]
+ movdqu xmm2, [rsi+rdx*2]
+ add rsi, rax
+ movdqu xmm3, [rsi]
+ movdqu xmm4, [rsi+rdx]
+ movdqu xmm5, [rsi+rdx*2]
+ add rsi, rax
+ movdqu xmm6, [rsi]
+ movdqu xmm7, [rsi+rdx]
+
+ movdqa XMMWORD PTR [rsp], xmm0 ; spill row 0 so xmm0 can take the 9th row
+
+ movdqu xmm0, [rsi+rdx*2] ; 9th row
+
+ movdqa XMMWORD PTR [rsp+16], xmm1
+ movdqa XMMWORD PTR [rsp+32], xmm2
+ movdqa XMMWORD PTR [rsp+48], xmm3
+ movdqa XMMWORD PTR [rsp+64], xmm4
+ movdqa XMMWORD PTR [rsp+80], xmm5
+ movdqa XMMWORD PTR [rsp+96], xmm6
+ movdqa XMMWORD PTR [rsp+112], xmm7
+ movdqa XMMWORD PTR [rsp+128], xmm0
+
+ movsxd rax, dword ptr arg(2) ; xoffset
+ cmp rax, 0 ; skip first_pass filter if xoffset=0
+ je .b8x8_sp_only
+
+ shl rax, 4 ; 16 bytes per filter entry
+ add rax, rcx ; HFilter
+
+ mov rdi, arg(4) ; dst_ptr
+ movsxd rdx, dword ptr arg(5) ; dst_pitch
+
+ movdqa xmm0, [rax] ; xmm0 = HFilter byte pairs
+
+ movsxd rax, dword ptr arg(3) ; yoffset
+ cmp rax, 0 ; skip second_pass filter if yoffset=0
+ je .b8x8_fp_only
+
+ shl rax, 4 ; 16 bytes per filter entry
+ lea rax, [rax + rcx] ; VFilter
+
+ lea rcx, [rdi+rdx*8] ; rcx = dst_ptr + 8*dst_pitch: loop end marker
+
+ movdqa xmm1, [rax] ; xmm1 = VFilter byte pairs
+
+ ; get the first horizontal line done
+ movdqa xmm3, [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+ movdqa xmm5, xmm3 ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 xx
+
+ psrldq xmm5, 1
+ lea rsp, [rsp + 16] ; next line
+
+ punpcklbw xmm3, xmm5 ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
+ pmaddubsw xmm3, xmm0 ; 00 02 04 06 08 10 12 14
+
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ movdqa xmm7, xmm3
+ packuswb xmm7, xmm7 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+
+.next_row:
+ movdqa xmm6, [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+ lea rsp, [rsp + 16] ; next line
+
+ movdqa xmm5, xmm6
+
+ psrldq xmm5, 1
+
+ punpcklbw xmm6, xmm5
+ pmaddubsw xmm6, xmm0
+
+ paddw xmm6, [GLOBAL(rd)] ; xmm6 += round value
+ psraw xmm6, VP8_FILTER_SHIFT ; xmm6 /= 128
+
+ packuswb xmm6, xmm6 ; xmm6 = current first-pass row
+
+ punpcklbw xmm7, xmm6 ; interleave previous/current first-pass rows
+ pmaddubsw xmm7, xmm1
+
+ paddw xmm7, [GLOBAL(rd)] ; xmm7 += round value
+ psraw xmm7, VP8_FILTER_SHIFT ; xmm7 /= 128
+
+ packuswb xmm7, xmm7
+
+ movq [rdi], xmm7 ; store the results in the destination
+ lea rdi, [rdi + rdx]
+
+ movdqa xmm7, xmm6 ; current row becomes the previous row
+
+ cmp rdi, rcx ; all 8 output rows written?
+ jne .next_row
+
+ jmp .done8x8
+
+.b8x8_sp_only:
+ movsxd rax, dword ptr arg(3) ; yoffset
+ shl rax, 4 ; 16 bytes per filter entry
+ lea rax, [rax + rcx] ; VFilter
+
+ mov rdi, arg(4) ;dst_ptr
+ movsxd rdx, dword ptr arg(5) ; dst_pitch
+
+ movdqa xmm0, [rax] ; VFilter
+
+ movq xmm1, XMMWORD PTR [rsp]
+ movq xmm2, XMMWORD PTR [rsp+16]
+
+ movq xmm3, XMMWORD PTR [rsp+32]
+ punpcklbw xmm1, xmm2
+
+ movq xmm4, XMMWORD PTR [rsp+48]
+ punpcklbw xmm2, xmm3
+
+ movq xmm5, XMMWORD PTR [rsp+64]
+ punpcklbw xmm3, xmm4
+
+ movq xmm6, XMMWORD PTR [rsp+80]
+ punpcklbw xmm4, xmm5
+
+ movq xmm7, XMMWORD PTR [rsp+96]
+ punpcklbw xmm5, xmm6
+
+ ; Because the source register (xmm0) is always treated as signed by
+ ; pmaddubsw, the constant '128' is treated as '-128'.
+ pmaddubsw xmm1, xmm0
+ pmaddubsw xmm2, xmm0
+
+ pmaddubsw xmm3, xmm0
+ pmaddubsw xmm4, xmm0
+
+ pmaddubsw xmm5, xmm0
+ punpcklbw xmm6, xmm7
+
+ pmaddubsw xmm6, xmm0
+ paddw xmm1, [GLOBAL(rd)]
+
+ paddw xmm2, [GLOBAL(rd)]
+ psraw xmm1, VP8_FILTER_SHIFT
+
+ paddw xmm3, [GLOBAL(rd)]
+ psraw xmm2, VP8_FILTER_SHIFT
+
+ paddw xmm4, [GLOBAL(rd)]
+ psraw xmm3, VP8_FILTER_SHIFT
+
+ paddw xmm5, [GLOBAL(rd)]
+ psraw xmm4, VP8_FILTER_SHIFT
+
+ paddw xmm6, [GLOBAL(rd)]
+ psraw xmm5, VP8_FILTER_SHIFT
+
+ psraw xmm6, VP8_FILTER_SHIFT
+
+ ; Having multiplied everything by '-128' and obtained negative
+ ; numbers, the unsigned saturation truncates those values to 0,
+ ; resulting in incorrect handling of xoffset == 0 && yoffset == 0
+ packuswb xmm1, xmm1
+
+ packuswb xmm2, xmm2
+ movq [rdi], xmm1 ; output row 0
+
+ packuswb xmm3, xmm3
+ movq [rdi+rdx], xmm2 ; output row 1
+
+ packuswb xmm4, xmm4
+ movq xmm1, XMMWORD PTR [rsp+112] ; scratch row 7
+
+ lea rdi, [rdi + 2*rdx]
+ movq xmm2, XMMWORD PTR [rsp+128] ; scratch row 8
+
+ packuswb xmm5, xmm5
+ movq [rdi], xmm3 ; output row 2
+
+ packuswb xmm6, xmm6
+ movq [rdi+rdx], xmm4 ; output row 3
+
+ lea rdi, [rdi + 2*rdx]
+ punpcklbw xmm7, xmm1
+
+ movq [rdi], xmm5 ; output row 4
+ pmaddubsw xmm7, xmm0
+
+ movq [rdi+rdx], xmm6 ; output row 5
+ punpcklbw xmm1, xmm2
+
+ pmaddubsw xmm1, xmm0
+ paddw xmm7, [GLOBAL(rd)]
+
+ psraw xmm7, VP8_FILTER_SHIFT
+ paddw xmm1, [GLOBAL(rd)]
+
+ psraw xmm1, VP8_FILTER_SHIFT
+ packuswb xmm7, xmm7
+
+ packuswb xmm1, xmm1
+ lea rdi, [rdi + 2*rdx]
+
+ movq [rdi], xmm7 ; output row 6
+
+ movq [rdi+rdx], xmm1 ; output row 7
+ lea rsp, [rsp + 144] ; step over the whole scratch area
+
+ jmp .done8x8
+
+.b8x8_fp_only:
+ lea rcx, [rdi+rdx*8] ; rcx = dst_ptr + 8*dst_pitch: loop end marker
+
+.next_row_fp: ; four output rows per iteration
+ movdqa xmm1, XMMWORD PTR [rsp]
+ movdqa xmm3, XMMWORD PTR [rsp+16]
+
+ movdqa xmm2, xmm1
+ movdqa xmm5, XMMWORD PTR [rsp+32]
+
+ psrldq xmm2, 1
+ movdqa xmm7, XMMWORD PTR [rsp+48]
+
+ movdqa xmm4, xmm3
+ psrldq xmm4, 1
+
+ movdqa xmm6, xmm5
+ psrldq xmm6, 1
+
+ punpcklbw xmm1, xmm2
+ pmaddubsw xmm1, xmm0
+
+ punpcklbw xmm3, xmm4
+ pmaddubsw xmm3, xmm0
+
+ punpcklbw xmm5, xmm6
+ pmaddubsw xmm5, xmm0
+
+ movdqa xmm2, xmm7
+ psrldq xmm2, 1
+
+ punpcklbw xmm7, xmm2
+ pmaddubsw xmm7, xmm0
+
+ paddw xmm1, [GLOBAL(rd)]
+ psraw xmm1, VP8_FILTER_SHIFT
+
+ paddw xmm3, [GLOBAL(rd)]
+ psraw xmm3, VP8_FILTER_SHIFT
+
+ paddw xmm5, [GLOBAL(rd)]
+ psraw xmm5, VP8_FILTER_SHIFT
+
+ paddw xmm7, [GLOBAL(rd)]
+ psraw xmm7, VP8_FILTER_SHIFT
+
+ packuswb xmm1, xmm1
+ packuswb xmm3, xmm3
+
+ packuswb xmm5, xmm5
+ movq [rdi], xmm1
+
+ packuswb xmm7, xmm7
+ movq [rdi+rdx], xmm3
+
+ lea rdi, [rdi + 2*rdx]
+ movq [rdi], xmm5
+
+ lea rsp, [rsp + 4*16] ; four scratch rows consumed
+ movq [rdi+rdx], xmm7
+
+ lea rdi, [rdi + 2*rdx]
+ cmp rdi, rcx
+
+ jne .next_row_fp
+
+ lea rsp, [rsp + 16] ; skip the unused 9th row: 144 bytes stepped in total
+
+.done8x8: ; every path arrives here with rsp advanced by 144 bytes
+ ;add rsp, 144
+ pop rsp ; NOTE(review): presumably reloads the rsp saved by ALIGN_STACK -- confirm against the macro
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+shuf1b: ; byte pairs (i, i+5): paired with k0_k5 in the horizontal 6-tap loop
+ db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
+shuf2b: ; byte pairs (i+2, i+4): paired with k2_k4
+ db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11
+shuf3b: ; byte pairs (i+1, i+3): paired with k1_k3
+ db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10
+
+align 16
+shuf2bfrom1:
+ db 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11, 9,13
+align 16
+shuf3bfrom1:
+ db 2, 6, 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11
+
+align 16
+rd:
+ times 8 dw 0x40 ; 64: rounding term added before the shift right by 7
+
+align 16
+k0_k5: ; 8 entries x 16 bytes, selected by (filter index << 4); all-zero entries take the 4-tap paths
+ times 8 db 0, 0 ;placeholder
+ times 8 db 0, 0
+ times 8 db 2, 1
+ times 8 db 0, 0
+ times 8 db 3, 3
+ times 8 db 0, 0
+ times 8 db 1, 2
+ times 8 db 0, 0
+k1_k3: ; addressed as [k0_k5 entry + 128]
+ times 8 db 0, 0 ;placeholder
+ times 8 db -6, 12
+ times 8 db -11, 36
+ times 8 db -9, 50
+ times 8 db -16, 77
+ times 8 db -6, 93
+ times 8 db -8, 108
+ times 8 db -1, 123
+k2_k4: ; addressed as [k0_k5 entry + 256]
+ times 8 db 128, 0 ;placeholder
+ times 8 db 123, -1
+ times 8 db 108, -8
+ times 8 db 93, -6
+ times 8 db 77, -16
+ times 8 db 50, -9
+ times 8 db 36, -11
+ times 8 db 12, -6
+align 16
+vp8_bilinear_filters_ssse3: ; 8 entries x 16 bytes, selected by (offset << 4); each pair sums to 128
+ times 8 db 128, 0 ; as a pmaddubsw coefficient this byte is signed, i.e. -128
+ times 8 db 112, 16
+ times 8 db 96, 32
+ times 8 db 80, 48
+ times 8 db 64, 64
+ times 8 db 48, 80
+ times 8 db 32, 96
+ times 8 db 16, 112
+
diff --git a/media/libvpx/libvpx/vp8/common/x86/vp8_asm_stubs.c b/media/libvpx/libvpx/vp8/common/x86/vp8_asm_stubs.c
new file mode 100644
index 0000000000..7fb83c2d5e
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/x86/vp8_asm_stubs.c
@@ -0,0 +1,365 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#include "vpx_ports/mem.h"
+
+extern const short vp8_six_tap_x86[8][6 * 8]; /* row selected by xoffset / yoffset */
+
+extern void vp8_filter_block1d_h6_mmx(unsigned char *src_ptr,
+ unsigned short *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp8_filter); /* 8-bit source in, 16-bit intermediate out */
+extern void vp8_filter_block1dc_v6_mmx(
+ unsigned short *src_ptr, unsigned char *output_ptr, int output_pitch,
+ unsigned int pixels_per_line, unsigned int pixel_step,
+ unsigned int output_height, unsigned int output_width,
+ const short *vp8_filter); /* 16-bit intermediate in, 8-bit pixels out */
+extern void vp8_filter_block1d8_h6_sse2(unsigned char *src_ptr,
+ unsigned short *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp8_filter); /* 8-bit source in, 16-bit intermediate out */
+extern void vp8_filter_block1d16_h6_sse2(unsigned char *src_ptr,
+ unsigned short *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp8_filter); /* 8-bit source in, 16-bit intermediate out */
+extern void vp8_filter_block1d8_v6_sse2(
+ unsigned short *src_ptr, unsigned char *output_ptr, int dst_ptich,
+ unsigned int pixels_per_line, unsigned int pixel_step,
+ unsigned int output_height, unsigned int output_width,
+ const short *vp8_filter); /* 16-bit intermediate in, 8-bit pixels out */
+extern void vp8_filter_block1d16_v6_sse2(
+ unsigned short *src_ptr, unsigned char *output_ptr, int dst_ptich,
+ unsigned int pixels_per_line, unsigned int pixel_step,
+ unsigned int output_height, unsigned int output_width,
+ const short *vp8_filter); /* 16-bit intermediate in, 8-bit pixels out */
+extern void vp8_unpack_block1d16_h6_sse2(unsigned char *src_ptr,
+ unsigned short *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int output_height,
+ unsigned int output_width); /* takes no filter; fills the 16-bit buffer when xoffset == 0 */
+extern void vp8_filter_block1d8_h6_only_sse2(unsigned char *src_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned char *output_ptr,
+ int dst_ptich,
+ unsigned int output_height,
+ const short *vp8_filter); /* 8-bit in, 8-bit out: no intermediate buffer */
+extern void vp8_filter_block1d16_h6_only_sse2(unsigned char *src_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned char *output_ptr,
+ int dst_ptich,
+ unsigned int output_height,
+ const short *vp8_filter); /* 8-bit in, 8-bit out: no intermediate buffer */
+extern void vp8_filter_block1d8_v6_only_sse2(unsigned char *src_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned char *output_ptr,
+ int dst_ptich,
+ unsigned int output_height,
+ const short *vp8_filter); /* 8-bit in, 8-bit out: no intermediate buffer */
+
+#if HAVE_MMX
+void vp8_sixtap_predict4x4_mmx(unsigned char *src_ptr, int src_pixels_per_line,
+ int xoffset, int yoffset, unsigned char *dst_ptr,
+ int dst_pitch) {
+ DECLARE_ALIGNED(16, unsigned short,
+ FData2[16 * 16]); /* Temp data buffer: first-pass output, second-pass input */
+ const short *HFilter, *VFilter;
+ HFilter = vp8_six_tap_x86[xoffset]; /* taps for the first pass */
+ vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2,
+ src_pixels_per_line, 1, 9, 8, HFilter); /* pixel_step 1, output_height 9, output_width 8 */
+ VFilter = vp8_six_tap_x86[yoffset]; /* taps for the second pass */
+ vp8_filter_block1dc_v6_mmx(FData2 + 8, dst_ptr, dst_pitch, 8, 4, 4, 4, /* pixels_per_line 8, pixel_step 4, height 4, width 4 */
+ VFilter);
+}
+#endif
+
+#if HAVE_SSE2
+void vp8_sixtap_predict16x16_sse2(unsigned char *src_ptr,
+ int src_pixels_per_line, int xoffset,
+ int yoffset, unsigned char *dst_ptr,
+ int dst_pitch) {
+ DECLARE_ALIGNED(16, unsigned short,
+ FData2[24 * 24]); /* Temp data buffer: first-pass output, second-pass input */
+
+ const short *HFilter, *VFilter;
+
+ if (xoffset) {
+ if (yoffset) { /* both passes, going through FData2 */
+ HFilter = vp8_six_tap_x86[xoffset];
+ vp8_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
+ src_pixels_per_line, 1, 21, 32, HFilter); /* pixel_step 1, output_height 21, output_width 32 */
+ VFilter = vp8_six_tap_x86[yoffset];
+ vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16, 16,
+ dst_pitch, VFilter);
+ } else {
+ /* First-pass only: yoffset == 0, so filter straight into dst_ptr */
+ HFilter = vp8_six_tap_x86[xoffset];
+ vp8_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr,
+ dst_pitch, 16, HFilter);
+ }
+ } else {
+ /* Second-pass only: xoffset == 0 (this branch is also taken when yoffset == 0) */
+ VFilter = vp8_six_tap_x86[yoffset];
+ vp8_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
+ src_pixels_per_line, 21, 32); /* output_height 21, output_width 32 */
+ vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16, 16,
+ dst_pitch, VFilter);
+ }
+}
+
+void vp8_sixtap_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line,
+ int xoffset, int yoffset,
+ unsigned char *dst_ptr, int dst_pitch) {
+ DECLARE_ALIGNED(16, unsigned short,
+ FData2[256]); /* Temp data buffer: first-pass output, second-pass input */
+ const short *HFilter, *VFilter;
+
+ if (xoffset) {
+ if (yoffset) { /* both passes, going through FData2 */
+ HFilter = vp8_six_tap_x86[xoffset];
+ vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
+ src_pixels_per_line, 1, 13, 16, HFilter); /* pixel_step 1, output_height 13, output_width 16 */
+ VFilter = vp8_six_tap_x86[yoffset];
+ vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 8,
+ dst_pitch, VFilter);
+ } else {
+ /* First-pass only: yoffset == 0, so filter straight into dst_ptr */
+ HFilter = vp8_six_tap_x86[xoffset];
+ vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr,
+ dst_pitch, 8, HFilter);
+ }
+ } else {
+ /* Second-pass only: xoffset == 0 (this branch is also taken when yoffset == 0) */
+ VFilter = vp8_six_tap_x86[yoffset];
+ vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line, dst_ptr, dst_pitch, 8,
+ VFilter);
+ }
+}
+
+void vp8_sixtap_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line,
+ int xoffset, int yoffset,
+ unsigned char *dst_ptr, int dst_pitch) {
+ DECLARE_ALIGNED(16, unsigned short,
+ FData2[256]); /* Temp data buffer: first-pass output, second-pass input */
+ const short *HFilter, *VFilter;
+
+ if (xoffset) {
+ if (yoffset) { /* both passes, going through FData2 */
+ HFilter = vp8_six_tap_x86[xoffset];
+ vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
+ src_pixels_per_line, 1, 9, 16, HFilter); /* pixel_step 1, output_height 9, output_width 16 */
+ VFilter = vp8_six_tap_x86[yoffset];
+ vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 4,
+ dst_pitch, VFilter);
+ } else {
+ /* First-pass only: yoffset == 0, so filter straight into dst_ptr */
+ HFilter = vp8_six_tap_x86[xoffset];
+ vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr,
+ dst_pitch, 4, HFilter);
+ }
+ } else {
+ /* Second-pass only: xoffset == 0 (this branch is also taken when yoffset == 0) */
+ VFilter = vp8_six_tap_x86[yoffset];
+ vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line, dst_ptr, dst_pitch, 4,
+ VFilter);
+ }
+}
+
+#endif
+
+#if HAVE_SSSE3
+
+extern void vp8_filter_block1d8_h6_ssse3(unsigned char *src_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned char *output_ptr,
+ unsigned int output_pitch,
+ unsigned int output_height,
+ unsigned int vp8_filter_index); /* callers below pass xoffset */
+
+extern void vp8_filter_block1d16_h6_ssse3(unsigned char *src_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned char *output_ptr,
+ unsigned int output_pitch,
+ unsigned int output_height,
+ unsigned int vp8_filter_index); /* callers below pass xoffset */
+
+extern void vp8_filter_block1d16_v6_ssse3(unsigned char *src_ptr,
+ unsigned int src_pitch,
+ unsigned char *output_ptr,
+ unsigned int out_pitch,
+ unsigned int output_height,
+ unsigned int vp8_filter_index); /* callers below pass yoffset */
+
+extern void vp8_filter_block1d8_v6_ssse3(unsigned char *src_ptr,
+ unsigned int src_pitch,
+ unsigned char *output_ptr,
+ unsigned int out_pitch,
+ unsigned int output_height,
+ unsigned int vp8_filter_index); /* callers below pass yoffset */
+
+extern void vp8_filter_block1d4_h6_ssse3(unsigned char *src_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned char *output_ptr,
+ unsigned int output_pitch,
+ unsigned int output_height,
+ unsigned int vp8_filter_index); /* index into the filter tap tables */
+
+extern void vp8_filter_block1d4_v6_ssse3(unsigned char *src_ptr,
+ unsigned int src_pitch,
+ unsigned char *output_ptr,
+ unsigned int out_pitch,
+ unsigned int output_height,
+ unsigned int vp8_filter_index); /* index into the filter tap tables */
+
+void vp8_sixtap_predict16x16_ssse3(unsigned char *src_ptr,
+ int src_pixels_per_line, int xoffset,
+ int yoffset, unsigned char *dst_ptr,
+ int dst_pitch) {
+ DECLARE_ALIGNED(16, unsigned char, FData2[24 * 24]); /* 8-bit first-pass output, read back by the second pass */
+
+ if (xoffset) {
+ if (yoffset) { /* both passes, going through FData2 */
+ vp8_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line, FData2, 16, 21, /* output_pitch 16, output_height 21 = 16 + 5 */
+ xoffset);
+ vp8_filter_block1d16_v6_ssse3(FData2, 16, dst_ptr, dst_pitch, 16, /* src_pitch 16 matches the pitch written above */
+ yoffset);
+ } else {
+ /* First-pass only: yoffset == 0, so filter straight into dst_ptr */
+ vp8_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr,
+ dst_pitch, 16, xoffset);
+ }
+ } else {
+ if (yoffset) {
+ /* Second-pass only: start two rows above src_ptr */
+ vp8_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line, dst_ptr, dst_pitch, 16,
+ yoffset);
+ } else {
+ /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
+ * yoffset==0) case correctly. Add copy function here to guarantee
+ * six-tap function handles all possible offsets. */
+ vp8_copy_mem16x16(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
+ }
+ }
+}
+
+void vp8_sixtap_predict8x8_ssse3(unsigned char *src_ptr,
+ int src_pixels_per_line, int xoffset,
+ int yoffset, unsigned char *dst_ptr,
+ int dst_pitch) {
+ DECLARE_ALIGNED(16, unsigned char, FData2[256]); /* 8-bit first-pass output, read back by the second pass */
+
+ if (xoffset) {
+ if (yoffset) { /* both passes, going through FData2 */
+ vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line, FData2, 8, 13, xoffset); /* output_pitch 8, output_height 13 = 8 + 5 */
+ vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, 8, yoffset);
+ } else { /* First-pass only: yoffset == 0 */
+ vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr,
+ dst_pitch, 8, xoffset);
+ }
+ } else {
+ if (yoffset) {
+ /* Second-pass only: start two rows above src_ptr */
+ vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line, dst_ptr, dst_pitch, 8,
+ yoffset);
+ } else {
+ /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
+ * yoffset==0) case correctly. Add copy function here to guarantee
+ * six-tap function handles all possible offsets. */
+ vp8_copy_mem8x8(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
+ }
+ }
+}
+
+/* Six-tap 8x4 predictor built on the SSSE3 kernels. The horizontal kernel
+ * runs when xoffset is non-zero, the vertical kernel when yoffset is
+ * non-zero, both (through a temporary buffer) when both are set, and a plain
+ * block copy is used when neither is. */
+void vp8_sixtap_predict8x4_ssse3(unsigned char *src_ptr,
+                                 int src_pixels_per_line, int xoffset,
+                                 int yoffset, unsigned char *dst_ptr,
+                                 int dst_pitch) {
+  DECLARE_ALIGNED(16, unsigned char, FData2[256]);
+
+  if (xoffset && yoffset) {
+    /* Two passes: 9 rows (4 output rows plus 5 extra), starting two rows
+     * above the block, go into FData2 and are then filtered vertically. */
+    unsigned char *const first_row = src_ptr - (2 * src_pixels_per_line);
+
+    vp8_filter_block1d8_h6_ssse3(first_row, src_pixels_per_line, FData2, 8, 9,
+                                 xoffset);
+    vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, 4, yoffset);
+  } else if (xoffset) {
+    /* Horizontal pass alone, written straight to the destination. */
+    vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr,
+                                 dst_pitch, 4, xoffset);
+  } else if (yoffset) {
+    /* Vertical pass alone, reading from two rows above the block. */
+    unsigned char *const first_row = src_ptr - (2 * src_pixels_per_line);
+
+    vp8_filter_block1d8_v6_ssse3(first_row, src_pixels_per_line, dst_ptr,
+                                 dst_pitch, 4, yoffset);
+  } else {
+    /* Neither offset is set. The SSSE3 second-pass kernel does not handle
+     * this combination correctly, so copy the block instead; that way every
+     * offset pair is supported. */
+    vp8_copy_mem8x4(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
+  }
+}
+
+/* Six-tap 4x4 predictor built on the SSSE3 kernels. The horizontal kernel
+ * runs when xoffset is non-zero, the vertical kernel when yoffset is
+ * non-zero, both (through a temporary buffer) when both are set, and the
+ * block is copied byte by byte when neither is. */
+void vp8_sixtap_predict4x4_ssse3(unsigned char *src_ptr,
+                                 int src_pixels_per_line, int xoffset,
+                                 int yoffset, unsigned char *dst_ptr,
+                                 int dst_pitch) {
+  DECLARE_ALIGNED(16, unsigned char, FData2[4 * 9]);
+
+  if (xoffset && yoffset) {
+    /* Two passes: 9 rows (4 output rows plus 5 extra), starting two rows
+     * above the block, fill FData2 and are then filtered vertically. */
+    unsigned char *const first_row = src_ptr - (2 * src_pixels_per_line);
+
+    vp8_filter_block1d4_h6_ssse3(first_row, src_pixels_per_line, FData2, 4, 9,
+                                 xoffset);
+    vp8_filter_block1d4_v6_ssse3(FData2, 4, dst_ptr, dst_pitch, 4, yoffset);
+  } else if (xoffset) {
+    /* Horizontal pass alone, written straight to the destination. */
+    vp8_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr,
+                                 dst_pitch, 4, xoffset);
+  } else if (yoffset) {
+    /* Vertical pass alone, reading from two rows above the block. */
+    unsigned char *const first_row = src_ptr - (2 * src_pixels_per_line);
+
+    vp8_filter_block1d4_v6_ssse3(first_row, src_pixels_per_line, dst_ptr,
+                                 dst_pitch, 4, yoffset);
+  } else {
+    /* Neither offset is set. The SSSE3 second-pass kernel does not handle
+     * this combination correctly, so copy the 4x4 block by hand; that way
+     * every offset pair is supported. */
+    int row;
+    int col;
+
+    for (row = 0; row < 4; ++row) {
+      for (col = 0; col < 4; ++col) {
+        dst_ptr[col] = src_ptr[col];
+      }
+      dst_ptr += dst_pitch;
+      src_ptr += src_pixels_per_line;
+    }
+  }
+}
+
+#endif