Diffstat (limited to 'media/libvpx/libvpx/vp8/common/arm')
15 files changed, 4918 insertions, 0 deletions
diff --git a/media/libvpx/libvpx/vp8/common/arm/loopfilter_arm.c b/media/libvpx/libvpx/vp8/common/arm/loopfilter_arm.c new file mode 100644 index 0000000000..48a1972048 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/arm/loopfilter_arm.c @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_config.h" +#include "./vp8_rtcd.h" +#include "vp8/common/arm/loopfilter_arm.h" +#include "vp8/common/loopfilter.h" +#include "vp8/common/onyxc_int.h" + +/* NEON loopfilter functions */ +/* Horizontal MB filtering */ +void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { + unsigned char mblim = *lfi->mblim; + unsigned char lim = *lfi->lim; + unsigned char hev_thr = *lfi->hev_thr; + vp8_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, mblim, lim, + hev_thr); + + if (u_ptr) + vp8_mbloop_filter_horizontal_edge_uv_neon(u_ptr, uv_stride, mblim, lim, + hev_thr, v_ptr); +} + +/* Vertical MB Filtering */ +void vp8_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { + unsigned char mblim = *lfi->mblim; + unsigned char lim = *lfi->lim; + unsigned char hev_thr = *lfi->hev_thr; + + vp8_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr); + + if (u_ptr) + vp8_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, mblim, lim, + hev_thr, v_ptr); +} + +/* Horizontal B Filtering */ +void vp8_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { + unsigned char blim = *lfi->blim; + unsigned char lim = *lfi->lim; + unsigned char hev_thr = *lfi->hev_thr; + + vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 4 * y_stride, y_stride, blim, + lim, hev_thr); + vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 8 * y_stride, y_stride, blim, + lim, hev_thr); + vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 12 * y_stride, y_stride, blim, + lim, hev_thr); + + if (u_ptr) + vp8_loop_filter_horizontal_edge_uv_neon(u_ptr + 4 * uv_stride, uv_stride, + blim, lim, hev_thr, + v_ptr + 4 * uv_stride); +} + +/* Vertical B Filtering */ +void vp8_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { + unsigned char blim = *lfi->blim; + unsigned char lim = *lfi->lim; + unsigned char hev_thr = *lfi->hev_thr; + + vp8_loop_filter_vertical_edge_y_neon(y_ptr + 4, y_stride, blim, lim, hev_thr); + vp8_loop_filter_vertical_edge_y_neon(y_ptr + 8, y_stride, blim, lim, hev_thr); + vp8_loop_filter_vertical_edge_y_neon(y_ptr + 12, y_stride, blim, lim, + hev_thr); + + if (u_ptr) + vp8_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, blim, lim, + hev_thr, v_ptr + 4); +} diff --git a/media/libvpx/libvpx/vp8/common/arm/loopfilter_arm.h b/media/libvpx/libvpx/vp8/common/arm/loopfilter_arm.h new file mode 100644 index 0000000000..6cf660d228 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/arm/loopfilter_arm.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2019 The WebM project authors. 
All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_COMMON_ARM_LOOPFILTER_ARM_H_ +#define VPX_VP8_COMMON_ARM_LOOPFILTER_ARM_H_ + +typedef void loopfilter_y_neon(unsigned char *src, int pitch, + unsigned char blimit, unsigned char limit, + unsigned char thresh); +typedef void loopfilter_uv_neon(unsigned char *u, int pitch, + unsigned char blimit, unsigned char limit, + unsigned char thresh, unsigned char *v); + +loopfilter_y_neon vp8_loop_filter_horizontal_edge_y_neon; +loopfilter_y_neon vp8_loop_filter_vertical_edge_y_neon; +loopfilter_uv_neon vp8_loop_filter_horizontal_edge_uv_neon; +loopfilter_uv_neon vp8_loop_filter_vertical_edge_uv_neon; + +loopfilter_y_neon vp8_mbloop_filter_horizontal_edge_y_neon; +loopfilter_y_neon vp8_mbloop_filter_vertical_edge_y_neon; +loopfilter_uv_neon vp8_mbloop_filter_horizontal_edge_uv_neon; +loopfilter_uv_neon vp8_mbloop_filter_vertical_edge_uv_neon; + +#endif // VPX_VP8_COMMON_ARM_LOOPFILTER_ARM_H_ diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c new file mode 100644 index 0000000000..590956dde1 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c @@ -0,0 +1,764 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
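The b-edge dispatch in loopfilter_arm.c above follows directly from VP8's block geometry: a 16x16 luma macroblock has inner 4x4-block edges at rows/columns 4, 8 and 12, while the 8x8 chroma planes have a single inner edge at offset 4. A loop-based sketch of the same horizontal dispatch (illustrative only, not part of this diff; the helper name is hypothetical):

/* Hypothetical loop-based equivalent of vp8_loop_filter_bh_neon above. */
static void loop_filter_bh_sketch(unsigned char *y, unsigned char *u,
                                  unsigned char *v, int y_stride,
                                  int uv_stride, loop_filter_info *lfi) {
  int row;
  /* Inner luma edges at rows 4, 8 and 12; row 0 is the macroblock edge,
   * handled by the mbh variant. */
  for (row = 4; row < 16; row += 4) {
    vp8_loop_filter_horizontal_edge_y_neon(y + row * y_stride, y_stride,
                                           *lfi->blim, *lfi->lim,
                                           *lfi->hev_thr);
  }
  /* Chroma planes are 8x8, so there is only the single inner edge at 4;
   * one call covers both planes, as in the original dispatch. */
  if (u) {
    vp8_loop_filter_horizontal_edge_uv_neon(u + 4 * uv_stride, uv_stride,
                                            *lfi->blim, *lfi->lim,
                                            *lfi->hev_thr, v + 4 * uv_stride);
  }
}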
+ */ + +#include <arm_neon.h> +#include <string.h> + +#include "./vpx_config.h" +#include "./vp8_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" + +static const uint8_t bifilter4_coeff[8][2] = { { 128, 0 }, { 112, 16 }, + { 96, 32 }, { 80, 48 }, + { 64, 64 }, { 48, 80 }, + { 32, 96 }, { 16, 112 } }; + +static INLINE uint8x8_t load_and_shift(const unsigned char *a) { + return vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(vld1_u8(a)), 32)); +} + +void vp8_bilinear_predict4x4_neon(unsigned char *src_ptr, + int src_pixels_per_line, int xoffset, + int yoffset, unsigned char *dst_ptr, + int dst_pitch) { + uint8x8_t e0, e1, e2; + + if (xoffset == 0) { // skip_1stpass_filter + uint8x8_t a0, a1, a2, a3, a4; + + a0 = load_and_shift(src_ptr); + src_ptr += src_pixels_per_line; + a1 = vld1_u8(src_ptr); + src_ptr += src_pixels_per_line; + a2 = load_and_shift(src_ptr); + src_ptr += src_pixels_per_line; + a3 = vld1_u8(src_ptr); + src_ptr += src_pixels_per_line; + a4 = vld1_u8(src_ptr); + + e0 = vext_u8(a0, a1, 4); + e1 = vext_u8(a2, a3, 4); + e2 = a4; + } else { + uint8x8_t a0, a1, a2, a3, a4, b4; + uint8x16_t a01, a23; + uint8x16_t b01, b23; + uint32x2x2_t c0, c1, c2, c3; + uint16x8_t d0, d1, d2; + const uint8x8_t filter0 = vdup_n_u8(bifilter4_coeff[xoffset][0]); + const uint8x8_t filter1 = vdup_n_u8(bifilter4_coeff[xoffset][1]); + + a0 = vld1_u8(src_ptr); + src_ptr += src_pixels_per_line; + a1 = vld1_u8(src_ptr); + src_ptr += src_pixels_per_line; + a2 = vld1_u8(src_ptr); + src_ptr += src_pixels_per_line; + a3 = vld1_u8(src_ptr); + src_ptr += src_pixels_per_line; + a4 = vld1_u8(src_ptr); + + a01 = vcombine_u8(a0, a1); + a23 = vcombine_u8(a2, a3); + + b01 = vreinterpretq_u8_u64(vshrq_n_u64(vreinterpretq_u64_u8(a01), 8)); + b23 = vreinterpretq_u8_u64(vshrq_n_u64(vreinterpretq_u64_u8(a23), 8)); + b4 = vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(a4), 8)); + + c0 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(a01)), + vreinterpret_u32_u8(vget_high_u8(a01))); + c1 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(a23)), + vreinterpret_u32_u8(vget_high_u8(a23))); + c2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(b01)), + vreinterpret_u32_u8(vget_high_u8(b01))); + c3 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(b23)), + vreinterpret_u32_u8(vget_high_u8(b23))); + + d0 = vmull_u8(vreinterpret_u8_u32(c0.val[0]), filter0); + d1 = vmull_u8(vreinterpret_u8_u32(c1.val[0]), filter0); + d2 = vmull_u8(a4, filter0); + + d0 = vmlal_u8(d0, vreinterpret_u8_u32(c2.val[0]), filter1); + d1 = vmlal_u8(d1, vreinterpret_u8_u32(c3.val[0]), filter1); + d2 = vmlal_u8(d2, b4, filter1); + + e0 = vqrshrn_n_u16(d0, 7); + e1 = vqrshrn_n_u16(d1, 7); + e2 = vqrshrn_n_u16(d2, 7); + } + + // secondpass_filter + if (yoffset == 0) { // skip_2ndpass_filter + store_unaligned_u8q(dst_ptr, dst_pitch, vcombine_u8(e0, e1)); + } else { + uint8x8_t f0, f1; + const uint8x8_t filter0 = vdup_n_u8(bifilter4_coeff[yoffset][0]); + const uint8x8_t filter1 = vdup_n_u8(bifilter4_coeff[yoffset][1]); + + uint16x8_t b0 = vmull_u8(e0, filter0); + uint16x8_t b1 = vmull_u8(e1, filter0); + + const uint8x8_t a0 = vext_u8(e0, e1, 4); + const uint8x8_t a1 = vext_u8(e1, e2, 4); + + b0 = vmlal_u8(b0, a0, filter1); + b1 = vmlal_u8(b1, a1, filter1); + + f0 = vqrshrn_n_u16(b0, 7); + f1 = vqrshrn_n_u16(b1, 7); + + store_unaligned_u8q(dst_ptr, dst_pitch, vcombine_u8(f0, f1)); + } +} + +void vp8_bilinear_predict8x4_neon(unsigned char *src_ptr, + int src_pixels_per_line, int xoffset, + int yoffset, unsigned char *dst_ptr, + int dst_pitch) { + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8; 
+ uint8x8_t d7u8, d9u8, d11u8, d22u8, d23u8, d24u8, d25u8, d26u8; + uint8x16_t q1u8, q2u8, q3u8, q4u8, q5u8; + uint16x8_t q1u16, q2u16, q3u16, q4u16; + uint16x8_t q6u16, q7u16, q8u16, q9u16, q10u16; + + if (xoffset == 0) { // skip_1stpass_filter + d22u8 = vld1_u8(src_ptr); + src_ptr += src_pixels_per_line; + d23u8 = vld1_u8(src_ptr); + src_ptr += src_pixels_per_line; + d24u8 = vld1_u8(src_ptr); + src_ptr += src_pixels_per_line; + d25u8 = vld1_u8(src_ptr); + src_ptr += src_pixels_per_line; + d26u8 = vld1_u8(src_ptr); + } else { + q1u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + q2u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + q3u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + q4u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + q5u8 = vld1q_u8(src_ptr); + + d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]); + + q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8); + q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8); + q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8); + q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8); + q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8); + + d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1); + d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1); + d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); + d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); + d11u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1); + + q6u16 = vmlal_u8(q6u16, d3u8, d1u8); + q7u16 = vmlal_u8(q7u16, d5u8, d1u8); + q8u16 = vmlal_u8(q8u16, d7u8, d1u8); + q9u16 = vmlal_u8(q9u16, d9u8, d1u8); + q10u16 = vmlal_u8(q10u16, d11u8, d1u8); + + d22u8 = vqrshrn_n_u16(q6u16, 7); + d23u8 = vqrshrn_n_u16(q7u16, 7); + d24u8 = vqrshrn_n_u16(q8u16, 7); + d25u8 = vqrshrn_n_u16(q9u16, 7); + d26u8 = vqrshrn_n_u16(q10u16, 7); + } + + // secondpass_filter + if (yoffset == 0) { // skip_2ndpass_filter + vst1_u8((uint8_t *)dst_ptr, d22u8); + dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d23u8); + dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d24u8); + dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d25u8); + } else { + d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]); + + q1u16 = vmull_u8(d22u8, d0u8); + q2u16 = vmull_u8(d23u8, d0u8); + q3u16 = vmull_u8(d24u8, d0u8); + q4u16 = vmull_u8(d25u8, d0u8); + + q1u16 = vmlal_u8(q1u16, d23u8, d1u8); + q2u16 = vmlal_u8(q2u16, d24u8, d1u8); + q3u16 = vmlal_u8(q3u16, d25u8, d1u8); + q4u16 = vmlal_u8(q4u16, d26u8, d1u8); + + d2u8 = vqrshrn_n_u16(q1u16, 7); + d3u8 = vqrshrn_n_u16(q2u16, 7); + d4u8 = vqrshrn_n_u16(q3u16, 7); + d5u8 = vqrshrn_n_u16(q4u16, 7); + + vst1_u8((uint8_t *)dst_ptr, d2u8); + dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d3u8); + dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d4u8); + dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d5u8); + } + return; +} + +void vp8_bilinear_predict8x8_neon(unsigned char *src_ptr, + int src_pixels_per_line, int xoffset, + int yoffset, unsigned char *dst_ptr, + int dst_pitch) { + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8, d11u8; + uint8x8_t d22u8, d23u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8, d30u8; + uint8x16_t q1u8, q2u8, q3u8, q4u8, q5u8; + uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16; + uint16x8_t q6u16, q7u16, q8u16, q9u16, q10u16; + + if (xoffset == 0) { // skip_1stpass_filter + d22u8 = vld1_u8(src_ptr); + src_ptr += src_pixels_per_line; + d23u8 = vld1_u8(src_ptr); + src_ptr += src_pixels_per_line; + d24u8 = vld1_u8(src_ptr); + 
src_ptr += src_pixels_per_line; + d25u8 = vld1_u8(src_ptr); + src_ptr += src_pixels_per_line; + d26u8 = vld1_u8(src_ptr); + src_ptr += src_pixels_per_line; + d27u8 = vld1_u8(src_ptr); + src_ptr += src_pixels_per_line; + d28u8 = vld1_u8(src_ptr); + src_ptr += src_pixels_per_line; + d29u8 = vld1_u8(src_ptr); + src_ptr += src_pixels_per_line; + d30u8 = vld1_u8(src_ptr); + } else { + q1u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + q2u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + q3u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + q4u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + + d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]); + + q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8); + q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8); + q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8); + q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8); + + d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1); + d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1); + d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); + d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); + + q6u16 = vmlal_u8(q6u16, d3u8, d1u8); + q7u16 = vmlal_u8(q7u16, d5u8, d1u8); + q8u16 = vmlal_u8(q8u16, d7u8, d1u8); + q9u16 = vmlal_u8(q9u16, d9u8, d1u8); + + d22u8 = vqrshrn_n_u16(q6u16, 7); + d23u8 = vqrshrn_n_u16(q7u16, 7); + d24u8 = vqrshrn_n_u16(q8u16, 7); + d25u8 = vqrshrn_n_u16(q9u16, 7); + + // first_pass filtering on the rest 5-line data + q1u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + q2u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + q3u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + q4u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + q5u8 = vld1q_u8(src_ptr); + + q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8); + q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8); + q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8); + q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8); + q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8); + + d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1); + d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1); + d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); + d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); + d11u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1); + + q6u16 = vmlal_u8(q6u16, d3u8, d1u8); + q7u16 = vmlal_u8(q7u16, d5u8, d1u8); + q8u16 = vmlal_u8(q8u16, d7u8, d1u8); + q9u16 = vmlal_u8(q9u16, d9u8, d1u8); + q10u16 = vmlal_u8(q10u16, d11u8, d1u8); + + d26u8 = vqrshrn_n_u16(q6u16, 7); + d27u8 = vqrshrn_n_u16(q7u16, 7); + d28u8 = vqrshrn_n_u16(q8u16, 7); + d29u8 = vqrshrn_n_u16(q9u16, 7); + d30u8 = vqrshrn_n_u16(q10u16, 7); + } + + // secondpass_filter + if (yoffset == 0) { // skip_2ndpass_filter + vst1_u8((uint8_t *)dst_ptr, d22u8); + dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d23u8); + dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d24u8); + dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d25u8); + dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d26u8); + dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d27u8); + dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d28u8); + dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d29u8); + } else { + d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]); + + q1u16 = vmull_u8(d22u8, d0u8); + q2u16 = vmull_u8(d23u8, d0u8); + q3u16 = vmull_u8(d24u8, d0u8); + q4u16 = vmull_u8(d25u8, d0u8); + q5u16 = vmull_u8(d26u8, d0u8); + q6u16 = vmull_u8(d27u8, d0u8); + q7u16 = 
vmull_u8(d28u8, d0u8); + q8u16 = vmull_u8(d29u8, d0u8); + + q1u16 = vmlal_u8(q1u16, d23u8, d1u8); + q2u16 = vmlal_u8(q2u16, d24u8, d1u8); + q3u16 = vmlal_u8(q3u16, d25u8, d1u8); + q4u16 = vmlal_u8(q4u16, d26u8, d1u8); + q5u16 = vmlal_u8(q5u16, d27u8, d1u8); + q6u16 = vmlal_u8(q6u16, d28u8, d1u8); + q7u16 = vmlal_u8(q7u16, d29u8, d1u8); + q8u16 = vmlal_u8(q8u16, d30u8, d1u8); + + d2u8 = vqrshrn_n_u16(q1u16, 7); + d3u8 = vqrshrn_n_u16(q2u16, 7); + d4u8 = vqrshrn_n_u16(q3u16, 7); + d5u8 = vqrshrn_n_u16(q4u16, 7); + d6u8 = vqrshrn_n_u16(q5u16, 7); + d7u8 = vqrshrn_n_u16(q6u16, 7); + d8u8 = vqrshrn_n_u16(q7u16, 7); + d9u8 = vqrshrn_n_u16(q8u16, 7); + + vst1_u8((uint8_t *)dst_ptr, d2u8); + dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d3u8); + dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d4u8); + dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d5u8); + dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d6u8); + dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d7u8); + dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d8u8); + dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d9u8); + } + return; +} + +void vp8_bilinear_predict16x16_neon(unsigned char *src_ptr, + int src_pixels_per_line, int xoffset, + int yoffset, unsigned char *dst_ptr, + int dst_pitch) { + int i; + unsigned char tmp[272]; + unsigned char *tmpp; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8; + uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d16u8, d17u8, d18u8; + uint8x8_t d19u8, d20u8, d21u8; + uint8x16_t q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8; + uint8x16_t q11u8, q12u8, q13u8, q14u8, q15u8; + uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16, q6u16, q7u16, q8u16; + uint16x8_t q9u16, q10u16, q11u16, q12u16, q13u16, q14u16; + + if (xoffset == 0) { // secondpass_bfilter16x16_only + d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]); + + q11u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + for (i = 4; i > 0; i--) { + q12u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + q13u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + q14u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + q15u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + + q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8); + q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8); + q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8); + q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8); + q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8); + q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8); + q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8); + q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8); + + q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8); + q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8); + q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8); + q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8); + q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8); + q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8); + q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8); + q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8); + + d2u8 = vqrshrn_n_u16(q1u16, 7); + d3u8 = vqrshrn_n_u16(q2u16, 7); + d4u8 = vqrshrn_n_u16(q3u16, 7); + d5u8 = vqrshrn_n_u16(q4u16, 7); + d6u8 = vqrshrn_n_u16(q5u16, 7); + d7u8 = vqrshrn_n_u16(q6u16, 7); + d8u8 = vqrshrn_n_u16(q7u16, 7); + d9u8 = vqrshrn_n_u16(q8u16, 7); + + q1u8 = vcombine_u8(d2u8, d3u8); + q2u8 = vcombine_u8(d4u8, d5u8); + q3u8 = vcombine_u8(d6u8, d7u8); + q4u8 = vcombine_u8(d8u8, d9u8); + + q11u8 = q15u8; + + vst1q_u8((uint8_t *)dst_ptr, 
q1u8); + dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q2u8); + dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q3u8); + dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q4u8); + dst_ptr += dst_pitch; + } + return; + } + + if (yoffset == 0) { // firstpass_bfilter16x16_only + d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]); + + for (i = 4; i > 0; i--) { + d2u8 = vld1_u8(src_ptr); + d3u8 = vld1_u8(src_ptr + 8); + d4u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + d5u8 = vld1_u8(src_ptr); + d6u8 = vld1_u8(src_ptr + 8); + d7u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + d8u8 = vld1_u8(src_ptr); + d9u8 = vld1_u8(src_ptr + 8); + d10u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + d11u8 = vld1_u8(src_ptr); + d12u8 = vld1_u8(src_ptr + 8); + d13u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + + q7u16 = vmull_u8(d2u8, d0u8); + q8u16 = vmull_u8(d3u8, d0u8); + q9u16 = vmull_u8(d5u8, d0u8); + q10u16 = vmull_u8(d6u8, d0u8); + q11u16 = vmull_u8(d8u8, d0u8); + q12u16 = vmull_u8(d9u8, d0u8); + q13u16 = vmull_u8(d11u8, d0u8); + q14u16 = vmull_u8(d12u8, d0u8); + + d2u8 = vext_u8(d2u8, d3u8, 1); + d5u8 = vext_u8(d5u8, d6u8, 1); + d8u8 = vext_u8(d8u8, d9u8, 1); + d11u8 = vext_u8(d11u8, d12u8, 1); + + q7u16 = vmlal_u8(q7u16, d2u8, d1u8); + q9u16 = vmlal_u8(q9u16, d5u8, d1u8); + q11u16 = vmlal_u8(q11u16, d8u8, d1u8); + q13u16 = vmlal_u8(q13u16, d11u8, d1u8); + + d3u8 = vext_u8(d3u8, d4u8, 1); + d6u8 = vext_u8(d6u8, d7u8, 1); + d9u8 = vext_u8(d9u8, d10u8, 1); + d12u8 = vext_u8(d12u8, d13u8, 1); + + q8u16 = vmlal_u8(q8u16, d3u8, d1u8); + q10u16 = vmlal_u8(q10u16, d6u8, d1u8); + q12u16 = vmlal_u8(q12u16, d9u8, d1u8); + q14u16 = vmlal_u8(q14u16, d12u8, d1u8); + + d14u8 = vqrshrn_n_u16(q7u16, 7); + d15u8 = vqrshrn_n_u16(q8u16, 7); + d16u8 = vqrshrn_n_u16(q9u16, 7); + d17u8 = vqrshrn_n_u16(q10u16, 7); + d18u8 = vqrshrn_n_u16(q11u16, 7); + d19u8 = vqrshrn_n_u16(q12u16, 7); + d20u8 = vqrshrn_n_u16(q13u16, 7); + d21u8 = vqrshrn_n_u16(q14u16, 7); + + q7u8 = vcombine_u8(d14u8, d15u8); + q8u8 = vcombine_u8(d16u8, d17u8); + q9u8 = vcombine_u8(d18u8, d19u8); + q10u8 = vcombine_u8(d20u8, d21u8); + + vst1q_u8((uint8_t *)dst_ptr, q7u8); + dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q8u8); + dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q9u8); + dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q10u8); + dst_ptr += dst_pitch; + } + return; + } + + d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]); + + d2u8 = vld1_u8(src_ptr); + d3u8 = vld1_u8(src_ptr + 8); + d4u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + d5u8 = vld1_u8(src_ptr); + d6u8 = vld1_u8(src_ptr + 8); + d7u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + d8u8 = vld1_u8(src_ptr); + d9u8 = vld1_u8(src_ptr + 8); + d10u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + d11u8 = vld1_u8(src_ptr); + d12u8 = vld1_u8(src_ptr + 8); + d13u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + + // First Pass: output_height lines x output_width columns (17x16) + tmpp = tmp; + for (i = 3; i > 0; i--) { + q7u16 = vmull_u8(d2u8, d0u8); + q8u16 = vmull_u8(d3u8, d0u8); + q9u16 = vmull_u8(d5u8, d0u8); + q10u16 = vmull_u8(d6u8, d0u8); + q11u16 = vmull_u8(d8u8, d0u8); + q12u16 = vmull_u8(d9u8, d0u8); + q13u16 = vmull_u8(d11u8, d0u8); + q14u16 = vmull_u8(d12u8, d0u8); + + d2u8 = vext_u8(d2u8, d3u8, 1); + d5u8 = vext_u8(d5u8, d6u8, 1); + d8u8 = 
vext_u8(d8u8, d9u8, 1); + d11u8 = vext_u8(d11u8, d12u8, 1); + + q7u16 = vmlal_u8(q7u16, d2u8, d1u8); + q9u16 = vmlal_u8(q9u16, d5u8, d1u8); + q11u16 = vmlal_u8(q11u16, d8u8, d1u8); + q13u16 = vmlal_u8(q13u16, d11u8, d1u8); + + d3u8 = vext_u8(d3u8, d4u8, 1); + d6u8 = vext_u8(d6u8, d7u8, 1); + d9u8 = vext_u8(d9u8, d10u8, 1); + d12u8 = vext_u8(d12u8, d13u8, 1); + + q8u16 = vmlal_u8(q8u16, d3u8, d1u8); + q10u16 = vmlal_u8(q10u16, d6u8, d1u8); + q12u16 = vmlal_u8(q12u16, d9u8, d1u8); + q14u16 = vmlal_u8(q14u16, d12u8, d1u8); + + d14u8 = vqrshrn_n_u16(q7u16, 7); + d15u8 = vqrshrn_n_u16(q8u16, 7); + d16u8 = vqrshrn_n_u16(q9u16, 7); + d17u8 = vqrshrn_n_u16(q10u16, 7); + d18u8 = vqrshrn_n_u16(q11u16, 7); + d19u8 = vqrshrn_n_u16(q12u16, 7); + d20u8 = vqrshrn_n_u16(q13u16, 7); + d21u8 = vqrshrn_n_u16(q14u16, 7); + + d2u8 = vld1_u8(src_ptr); + d3u8 = vld1_u8(src_ptr + 8); + d4u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + d5u8 = vld1_u8(src_ptr); + d6u8 = vld1_u8(src_ptr + 8); + d7u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + d8u8 = vld1_u8(src_ptr); + d9u8 = vld1_u8(src_ptr + 8); + d10u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + d11u8 = vld1_u8(src_ptr); + d12u8 = vld1_u8(src_ptr + 8); + d13u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + + q7u8 = vcombine_u8(d14u8, d15u8); + q8u8 = vcombine_u8(d16u8, d17u8); + q9u8 = vcombine_u8(d18u8, d19u8); + q10u8 = vcombine_u8(d20u8, d21u8); + + vst1q_u8((uint8_t *)tmpp, q7u8); + tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q8u8); + tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q9u8); + tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q10u8); + tmpp += 16; + } + + // First-pass filtering for rest 5 lines + d14u8 = vld1_u8(src_ptr); + d15u8 = vld1_u8(src_ptr + 8); + d16u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + + q9u16 = vmull_u8(d2u8, d0u8); + q10u16 = vmull_u8(d3u8, d0u8); + q11u16 = vmull_u8(d5u8, d0u8); + q12u16 = vmull_u8(d6u8, d0u8); + q13u16 = vmull_u8(d8u8, d0u8); + q14u16 = vmull_u8(d9u8, d0u8); + + d2u8 = vext_u8(d2u8, d3u8, 1); + d5u8 = vext_u8(d5u8, d6u8, 1); + d8u8 = vext_u8(d8u8, d9u8, 1); + + q9u16 = vmlal_u8(q9u16, d2u8, d1u8); + q11u16 = vmlal_u8(q11u16, d5u8, d1u8); + q13u16 = vmlal_u8(q13u16, d8u8, d1u8); + + d3u8 = vext_u8(d3u8, d4u8, 1); + d6u8 = vext_u8(d6u8, d7u8, 1); + d9u8 = vext_u8(d9u8, d10u8, 1); + + q10u16 = vmlal_u8(q10u16, d3u8, d1u8); + q12u16 = vmlal_u8(q12u16, d6u8, d1u8); + q14u16 = vmlal_u8(q14u16, d9u8, d1u8); + + q1u16 = vmull_u8(d11u8, d0u8); + q2u16 = vmull_u8(d12u8, d0u8); + q3u16 = vmull_u8(d14u8, d0u8); + q4u16 = vmull_u8(d15u8, d0u8); + + d11u8 = vext_u8(d11u8, d12u8, 1); + d14u8 = vext_u8(d14u8, d15u8, 1); + + q1u16 = vmlal_u8(q1u16, d11u8, d1u8); + q3u16 = vmlal_u8(q3u16, d14u8, d1u8); + + d12u8 = vext_u8(d12u8, d13u8, 1); + d15u8 = vext_u8(d15u8, d16u8, 1); + + q2u16 = vmlal_u8(q2u16, d12u8, d1u8); + q4u16 = vmlal_u8(q4u16, d15u8, d1u8); + + d10u8 = vqrshrn_n_u16(q9u16, 7); + d11u8 = vqrshrn_n_u16(q10u16, 7); + d12u8 = vqrshrn_n_u16(q11u16, 7); + d13u8 = vqrshrn_n_u16(q12u16, 7); + d14u8 = vqrshrn_n_u16(q13u16, 7); + d15u8 = vqrshrn_n_u16(q14u16, 7); + d16u8 = vqrshrn_n_u16(q1u16, 7); + d17u8 = vqrshrn_n_u16(q2u16, 7); + d18u8 = vqrshrn_n_u16(q3u16, 7); + d19u8 = vqrshrn_n_u16(q4u16, 7); + + q5u8 = vcombine_u8(d10u8, d11u8); + q6u8 = vcombine_u8(d12u8, d13u8); + q7u8 = vcombine_u8(d14u8, d15u8); + q8u8 = vcombine_u8(d16u8, d17u8); + q9u8 = vcombine_u8(d18u8, d19u8); + + vst1q_u8((uint8_t *)tmpp, q5u8); + tmpp += 16; + vst1q_u8((uint8_t *)tmpp, 
q6u8); + tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q7u8); + tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q8u8); + tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q9u8); + + // secondpass_filter + d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]); + + tmpp = tmp; + q11u8 = vld1q_u8(tmpp); + tmpp += 16; + for (i = 4; i > 0; i--) { + q12u8 = vld1q_u8(tmpp); + tmpp += 16; + q13u8 = vld1q_u8(tmpp); + tmpp += 16; + q14u8 = vld1q_u8(tmpp); + tmpp += 16; + q15u8 = vld1q_u8(tmpp); + tmpp += 16; + + q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8); + q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8); + q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8); + q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8); + q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8); + q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8); + q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8); + q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8); + + q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8); + q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8); + q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8); + q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8); + q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8); + q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8); + q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8); + q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8); + + d2u8 = vqrshrn_n_u16(q1u16, 7); + d3u8 = vqrshrn_n_u16(q2u16, 7); + d4u8 = vqrshrn_n_u16(q3u16, 7); + d5u8 = vqrshrn_n_u16(q4u16, 7); + d6u8 = vqrshrn_n_u16(q5u16, 7); + d7u8 = vqrshrn_n_u16(q6u16, 7); + d8u8 = vqrshrn_n_u16(q7u16, 7); + d9u8 = vqrshrn_n_u16(q8u16, 7); + + q1u8 = vcombine_u8(d2u8, d3u8); + q2u8 = vcombine_u8(d4u8, d5u8); + q3u8 = vcombine_u8(d6u8, d7u8); + q4u8 = vcombine_u8(d8u8, d9u8); + + q11u8 = q15u8; + + vst1q_u8((uint8_t *)dst_ptr, q1u8); + dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q2u8); + dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q3u8); + dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q4u8); + dst_ptr += dst_pitch; + } + return; +} diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/copymem_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/copymem_neon.c new file mode 100644 index 0000000000..c89b47d628 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/arm/neon/copymem_neon.c @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
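All four bilinear predictors above share one weighting scheme: each bifilter4_coeff pair sums to 128, so vmull/vmlal build a 7-bit fixed-point weighted sum and vqrshrn_n_u16(..., 7) rounds and narrows it back to 8 bits. The 16x16 version buffers tmp[272] because the vertical pass blends row i with row i+1, so the horizontal pass must emit 17 rows of 16 pixels (17 * 16 = 272). A scalar sketch of one output pixel (illustrative, not part of this diff; helper names are hypothetical):

/* (a * c0 + b * c1 + 64) >> 7 models the rounding narrow
 * vqrshrn_n_u16(..., 7); c0 + c1 == 128, so the result fits in 8 bits. */
static unsigned char bilinear_tap(unsigned char a, unsigned char b,
                                  const uint8_t coeff[2]) {
  return (unsigned char)((a * coeff[0] + b * coeff[1] + 64) >> 7);
}

/* One output pixel at (x, y): horizontal tap on two rows, then a vertical
 * tap across them; offset 0 selects {128, 0} and passes a pixel through,
 * which is what the skip_1stpass/skip_2ndpass branches exploit. */
static unsigned char bilinear_pixel(const unsigned char *src, int stride,
                                    int x, int y, int xoffset, int yoffset) {
  const unsigned char top =
      bilinear_tap(src[y * stride + x], src[y * stride + x + 1],
                   bifilter4_coeff[xoffset]);
  const unsigned char bot =
      bilinear_tap(src[(y + 1) * stride + x], src[(y + 1) * stride + x + 1],
                   bifilter4_coeff[xoffset]);
  return bilinear_tap(top, bot, bifilter4_coeff[yoffset]);
}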
+ */ + +#include <arm_neon.h> + +#include "./vp8_rtcd.h" + +void vp8_copy_mem8x4_neon(unsigned char *src, int src_stride, + unsigned char *dst, int dst_stride) { + uint8x8_t vtmp; + int r; + + for (r = 0; r < 4; ++r) { + vtmp = vld1_u8(src); + vst1_u8(dst, vtmp); + src += src_stride; + dst += dst_stride; + } +} + +void vp8_copy_mem8x8_neon(unsigned char *src, int src_stride, + unsigned char *dst, int dst_stride) { + uint8x8_t vtmp; + int r; + + for (r = 0; r < 8; ++r) { + vtmp = vld1_u8(src); + vst1_u8(dst, vtmp); + src += src_stride; + dst += dst_stride; + } +} + +void vp8_copy_mem16x16_neon(unsigned char *src, int src_stride, + unsigned char *dst, int dst_stride) { + int r; + uint8x16_t qtmp; + + for (r = 0; r < 16; ++r) { + qtmp = vld1q_u8(src); + vst1q_u8(dst, qtmp); + src += src_stride; + dst += dst_stride; + } +} diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c new file mode 100644 index 0000000000..d12c3a8392 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vp8_rtcd.h" + +void vp8_dc_only_idct_add_neon(int16_t input_dc, unsigned char *pred_ptr, + int pred_stride, unsigned char *dst_ptr, + int dst_stride) { + int i; + uint16_t a1 = ((input_dc + 4) >> 3); + uint32x2_t d2u32 = vdup_n_u32(0); + uint8x8_t d2u8; + uint16x8_t q1u16; + uint16x8_t qAdd; + + qAdd = vdupq_n_u16(a1); + + for (i = 0; i < 2; ++i) { + d2u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d2u32, 0); + pred_ptr += pred_stride; + d2u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d2u32, 1); + pred_ptr += pred_stride; + + q1u16 = vaddw_u8(qAdd, vreinterpret_u8_u32(d2u32)); + d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16)); + + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 0); + dst_ptr += dst_stride; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 1); + dst_ptr += dst_stride; + } +} diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/dequant_idct_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/dequant_idct_neon.c new file mode 100644 index 0000000000..5445f2965a --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/arm/neon/dequant_idct_neon.c @@ -0,0 +1,141 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vp8_rtcd.h" + +static const int16_t cospi8sqrt2minus1 = 20091; +// 35468 exceeds INT16_MAX and gets converted to a negative number. Because of +// the way it is used in vqdmulh, where the result is doubled, it can be divided +// by 2 beforehand. This saves compensating for the negative value as well as +// shifting the result. 
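A scalar view of that trick (illustrative sketch, not the library's code): vqdmulh computes (2 * a * b) >> 16 per lane, so halving the constant beforehand yields exactly (x * 35468) >> 16 while keeping the multiplier inside int16_t range.

/* Scalar equivalent of vqdmulhq_n_s16(x, 35468 >> 1).
 * vqdmulh doubles the product and keeps the high half, so with
 * c = 35468 >> 1 = 17734 each lane is (x * 35468) >> 16 and the
 * constant never overflows int16_t. */
static int16_t mul_sinpi8sqrt2_scalar(int16_t x) {
  return (int16_t)(((int32_t)x * 35468) >> 16);
}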
+static const int16_t sinpi8sqrt2 = 35468 >> 1; + +void vp8_dequant_idct_add_neon(int16_t *input, int16_t *dq, unsigned char *dst, + int stride) { + unsigned char *dst0; + int32x2_t d14, d15; + int16x4_t d2, d3, d4, d5, d10, d11, d12, d13; + int16x8_t q1, q2, q3, q4, q5, q6; + int16x8_t qEmpty = vdupq_n_s16(0); + int32x2x2_t d2tmp0, d2tmp1; + int16x4x2_t d2tmp2, d2tmp3; + + d14 = d15 = vdup_n_s32(0); + + // load input + q3 = vld1q_s16(input); + vst1q_s16(input, qEmpty); + input += 8; + q4 = vld1q_s16(input); + vst1q_s16(input, qEmpty); + + // load dq + q5 = vld1q_s16(dq); + dq += 8; + q6 = vld1q_s16(dq); + + // load src from dst + dst0 = dst; + d14 = vld1_lane_s32((const int32_t *)dst0, d14, 0); + dst0 += stride; + d14 = vld1_lane_s32((const int32_t *)dst0, d14, 1); + dst0 += stride; + d15 = vld1_lane_s32((const int32_t *)dst0, d15, 0); + dst0 += stride; + d15 = vld1_lane_s32((const int32_t *)dst0, d15, 1); + + q1 = vreinterpretq_s16_u16( + vmulq_u16(vreinterpretq_u16_s16(q3), vreinterpretq_u16_s16(q5))); + q2 = vreinterpretq_s16_u16( + vmulq_u16(vreinterpretq_u16_s16(q4), vreinterpretq_u16_s16(q6))); + + d12 = vqadd_s16(vget_low_s16(q1), vget_low_s16(q2)); + d13 = vqsub_s16(vget_low_s16(q1), vget_low_s16(q2)); + + q2 = vcombine_s16(vget_high_s16(q1), vget_high_s16(q2)); + + q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2); + q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1); + + q4 = vshrq_n_s16(q4, 1); + + q4 = vqaddq_s16(q4, q2); + + d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4)); + d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4)); + + d2 = vqadd_s16(d12, d11); + d3 = vqadd_s16(d13, d10); + d4 = vqsub_s16(d13, d10); + d5 = vqsub_s16(d12, d11); + + d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4)); + d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5)); + d2tmp2 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]), + vreinterpret_s16_s32(d2tmp1.val[0])); + d2tmp3 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]), + vreinterpret_s16_s32(d2tmp1.val[1])); + + // loop 2 + q2 = vcombine_s16(d2tmp2.val[1], d2tmp3.val[1]); + + q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2); + q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1); + + d12 = vqadd_s16(d2tmp2.val[0], d2tmp3.val[0]); + d13 = vqsub_s16(d2tmp2.val[0], d2tmp3.val[0]); + + q4 = vshrq_n_s16(q4, 1); + + q4 = vqaddq_s16(q4, q2); + + d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4)); + d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4)); + + d2 = vqadd_s16(d12, d11); + d3 = vqadd_s16(d13, d10); + d4 = vqsub_s16(d13, d10); + d5 = vqsub_s16(d12, d11); + + d2 = vrshr_n_s16(d2, 3); + d3 = vrshr_n_s16(d3, 3); + d4 = vrshr_n_s16(d4, 3); + d5 = vrshr_n_s16(d5, 3); + + d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4)); + d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5)); + d2tmp2 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]), + vreinterpret_s16_s32(d2tmp1.val[0])); + d2tmp3 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]), + vreinterpret_s16_s32(d2tmp1.val[1])); + + q1 = vcombine_s16(d2tmp2.val[0], d2tmp2.val[1]); + q2 = vcombine_s16(d2tmp3.val[0], d2tmp3.val[1]); + + q1 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(q1), vreinterpret_u8_s32(d14))); + q2 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(q2), vreinterpret_u8_s32(d15))); + + d14 = vreinterpret_s32_u8(vqmovun_s16(q1)); + d15 = vreinterpret_s32_u8(vqmovun_s16(q2)); + + dst0 = dst; + vst1_lane_s32((int32_t *)dst0, d14, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst0, d14, 1); + dst0 += stride; + vst1_lane_s32((int32_t 
*)dst0, d15, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst0, d15, 1); + return; +} diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/dequantizeb_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/dequantizeb_neon.c new file mode 100644 index 0000000000..791aaea2ae --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/arm/neon/dequantizeb_neon.c @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vp8_rtcd.h" +#include "vp8/common/blockd.h" + +void vp8_dequantize_b_neon(BLOCKD *d, short *DQC) { + int16x8x2_t qQ, qDQC, qDQ; + + qQ = vld2q_s16(d->qcoeff); + qDQC = vld2q_s16(DQC); + + qDQ.val[0] = vmulq_s16(qQ.val[0], qDQC.val[0]); + qDQ.val[1] = vmulq_s16(qQ.val[1], qDQC.val[1]); + + vst2q_s16(d->dqcoeff, qDQ); +} diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/idct_blk_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/idct_blk_neon.c new file mode 100644 index 0000000000..5c26ce67a4 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/arm/neon/idct_blk_neon.c @@ -0,0 +1,295 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
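A scalar reference for vp8_dequantize_b_neon above (illustrative sketch): dequantization is a plain elementwise multiply, so the interleaving vld2q/vst2q loads are only a convenient way to cover all 16 coefficients with two 8-lane multiplies; lane order is irrelevant to the result.

/* Illustrative scalar model of vp8_dequantize_b_neon. */
static void dequantize_b_scalar(BLOCKD *d, const short *DQC) {
  int i;
  for (i = 0; i < 16; ++i) d->dqcoeff[i] = d->qcoeff[i] * DQC[i];
}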
+ */ + +#include <arm_neon.h> + +#include "./vp8_rtcd.h" + +static void idct_dequant_0_2x_neon(int16_t *q, int16_t dq, unsigned char *dst, + int stride) { + unsigned char *dst0; + int i, a0, a1; + int16x8x2_t q2Add; + int32x2_t d2s32 = vdup_n_s32(0), d4s32 = vdup_n_s32(0); + uint8x8_t d2u8, d4u8; + uint16x8_t q1u16, q2u16; + + a0 = ((q[0] * dq) + 4) >> 3; + a1 = ((q[16] * dq) + 4) >> 3; + q[0] = q[16] = 0; + q2Add.val[0] = vdupq_n_s16((int16_t)a0); + q2Add.val[1] = vdupq_n_s16((int16_t)a1); + + for (i = 0; i < 2; i++, dst += 4) { + dst0 = dst; + d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 0); + dst0 += stride; + d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 1); + dst0 += stride; + d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 0); + dst0 += stride; + d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 1); + + q1u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]), + vreinterpret_u8_s32(d2s32)); + q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]), + vreinterpret_u8_s32(d4s32)); + + d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16)); + d4u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16)); + + d2s32 = vreinterpret_s32_u8(d2u8); + d4s32 = vreinterpret_s32_u8(d4u8); + + dst0 = dst; + vst1_lane_s32((int32_t *)dst0, d2s32, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst0, d2s32, 1); + dst0 += stride; + vst1_lane_s32((int32_t *)dst0, d4s32, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst0, d4s32, 1); + } +} + +static const int16_t cospi8sqrt2minus1 = 20091; +static const int16_t sinpi8sqrt2 = 17734; +// because the lowest bit in 0x8a8c is 0, we can pre-shift this + +static void idct_dequant_full_2x_neon(int16_t *q, int16_t *dq, + unsigned char *dst, int stride) { + unsigned char *dst0, *dst1; + int32x2_t d28, d29, d30, d31; + int16x8_t q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11; + int16x8_t qEmpty = vdupq_n_s16(0); + int32x4x2_t q2tmp0, q2tmp1; + int16x8x2_t q2tmp2, q2tmp3; + int16x4_t dLow0, dLow1, dHigh0, dHigh1; + + d28 = d29 = d30 = d31 = vdup_n_s32(0); + + // load dq + q0 = vld1q_s16(dq); + dq += 8; + q1 = vld1q_s16(dq); + + // load q + q2 = vld1q_s16(q); + vst1q_s16(q, qEmpty); + q += 8; + q3 = vld1q_s16(q); + vst1q_s16(q, qEmpty); + q += 8; + q4 = vld1q_s16(q); + vst1q_s16(q, qEmpty); + q += 8; + q5 = vld1q_s16(q); + vst1q_s16(q, qEmpty); + + // load src from dst + dst0 = dst; + dst1 = dst + 4; + d28 = vld1_lane_s32((const int32_t *)dst0, d28, 0); + dst0 += stride; + d28 = vld1_lane_s32((const int32_t *)dst1, d28, 1); + dst1 += stride; + d29 = vld1_lane_s32((const int32_t *)dst0, d29, 0); + dst0 += stride; + d29 = vld1_lane_s32((const int32_t *)dst1, d29, 1); + dst1 += stride; + + d30 = vld1_lane_s32((const int32_t *)dst0, d30, 0); + dst0 += stride; + d30 = vld1_lane_s32((const int32_t *)dst1, d30, 1); + dst1 += stride; + d31 = vld1_lane_s32((const int32_t *)dst0, d31, 0); + d31 = vld1_lane_s32((const int32_t *)dst1, d31, 1); + + q2 = vmulq_s16(q2, q0); + q3 = vmulq_s16(q3, q1); + q4 = vmulq_s16(q4, q0); + q5 = vmulq_s16(q5, q1); + + // vswp + dLow0 = vget_low_s16(q2); + dHigh0 = vget_high_s16(q2); + dLow1 = vget_low_s16(q4); + dHigh1 = vget_high_s16(q4); + q2 = vcombine_s16(dLow0, dLow1); + q4 = vcombine_s16(dHigh0, dHigh1); + + dLow0 = vget_low_s16(q3); + dHigh0 = vget_high_s16(q3); + dLow1 = vget_low_s16(q5); + dHigh1 = vget_high_s16(q5); + q3 = vcombine_s16(dLow0, dLow1); + q5 = vcombine_s16(dHigh0, dHigh1); + + q6 = vqdmulhq_n_s16(q4, sinpi8sqrt2); + q7 = vqdmulhq_n_s16(q5, sinpi8sqrt2); + q8 = vqdmulhq_n_s16(q4, cospi8sqrt2minus1); + q9 = 
vqdmulhq_n_s16(q5, cospi8sqrt2minus1); + + q10 = vqaddq_s16(q2, q3); + q11 = vqsubq_s16(q2, q3); + + q8 = vshrq_n_s16(q8, 1); + q9 = vshrq_n_s16(q9, 1); + + q4 = vqaddq_s16(q4, q8); + q5 = vqaddq_s16(q5, q9); + + q2 = vqsubq_s16(q6, q5); + q3 = vqaddq_s16(q7, q4); + + q4 = vqaddq_s16(q10, q3); + q5 = vqaddq_s16(q11, q2); + q6 = vqsubq_s16(q11, q2); + q7 = vqsubq_s16(q10, q3); + + q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6)); + q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7)); + q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]), + vreinterpretq_s16_s32(q2tmp1.val[0])); + q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]), + vreinterpretq_s16_s32(q2tmp1.val[1])); + + // loop 2 + q8 = vqdmulhq_n_s16(q2tmp2.val[1], sinpi8sqrt2); + q9 = vqdmulhq_n_s16(q2tmp3.val[1], sinpi8sqrt2); + q10 = vqdmulhq_n_s16(q2tmp2.val[1], cospi8sqrt2minus1); + q11 = vqdmulhq_n_s16(q2tmp3.val[1], cospi8sqrt2minus1); + + q2 = vqaddq_s16(q2tmp2.val[0], q2tmp3.val[0]); + q3 = vqsubq_s16(q2tmp2.val[0], q2tmp3.val[0]); + + q10 = vshrq_n_s16(q10, 1); + q11 = vshrq_n_s16(q11, 1); + + q10 = vqaddq_s16(q2tmp2.val[1], q10); + q11 = vqaddq_s16(q2tmp3.val[1], q11); + + q8 = vqsubq_s16(q8, q11); + q9 = vqaddq_s16(q9, q10); + + q4 = vqaddq_s16(q2, q9); + q5 = vqaddq_s16(q3, q8); + q6 = vqsubq_s16(q3, q8); + q7 = vqsubq_s16(q2, q9); + + q4 = vrshrq_n_s16(q4, 3); + q5 = vrshrq_n_s16(q5, 3); + q6 = vrshrq_n_s16(q6, 3); + q7 = vrshrq_n_s16(q7, 3); + + q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6)); + q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7)); + q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]), + vreinterpretq_s16_s32(q2tmp1.val[0])); + q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]), + vreinterpretq_s16_s32(q2tmp1.val[1])); + + q4 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[0]), vreinterpret_u8_s32(d28))); + q5 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[1]), vreinterpret_u8_s32(d29))); + q6 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[0]), vreinterpret_u8_s32(d30))); + q7 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[1]), vreinterpret_u8_s32(d31))); + + d28 = vreinterpret_s32_u8(vqmovun_s16(q4)); + d29 = vreinterpret_s32_u8(vqmovun_s16(q5)); + d30 = vreinterpret_s32_u8(vqmovun_s16(q6)); + d31 = vreinterpret_s32_u8(vqmovun_s16(q7)); + + dst0 = dst; + dst1 = dst + 4; + vst1_lane_s32((int32_t *)dst0, d28, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst1, d28, 1); + dst1 += stride; + vst1_lane_s32((int32_t *)dst0, d29, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst1, d29, 1); + dst1 += stride; + + vst1_lane_s32((int32_t *)dst0, d30, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst1, d30, 1); + dst1 += stride; + vst1_lane_s32((int32_t *)dst0, d31, 0); + vst1_lane_s32((int32_t *)dst1, d31, 1); +} + +void vp8_dequant_idct_add_y_block_neon(short *q, short *dq, unsigned char *dst, + int stride, char *eobs) { + int i; + + for (i = 0; i < 4; ++i) { + if (((short *)(eobs))[0]) { + if (((short *)eobs)[0] & 0xfefe) + idct_dequant_full_2x_neon(q, dq, dst, stride); + else + idct_dequant_0_2x_neon(q, dq[0], dst, stride); + } + + if (((short *)(eobs))[1]) { + if (((short *)eobs)[1] & 0xfefe) + idct_dequant_full_2x_neon(q + 32, dq, dst + 8, stride); + else + idct_dequant_0_2x_neon(q + 32, dq[0], dst + 8, stride); + } + q += 64; + dst += 4 * stride; + eobs += 4; + } +} + +void 
vp8_dequant_idct_add_uv_block_neon(short *q, short *dq, + unsigned char *dst_u, + unsigned char *dst_v, int stride, + char *eobs) { + if (((short *)(eobs))[0]) { + if (((short *)eobs)[0] & 0xfefe) + idct_dequant_full_2x_neon(q, dq, dst_u, stride); + else + idct_dequant_0_2x_neon(q, dq[0], dst_u, stride); + } + + q += 32; + dst_u += 4 * stride; + + if (((short *)(eobs))[1]) { + if (((short *)eobs)[1] & 0xfefe) + idct_dequant_full_2x_neon(q, dq, dst_u, stride); + else + idct_dequant_0_2x_neon(q, dq[0], dst_u, stride); + } + + q += 32; + + if (((short *)(eobs))[2]) { + if (((short *)eobs)[2] & 0xfefe) + idct_dequant_full_2x_neon(q, dq, dst_v, stride); + else + idct_dequant_0_2x_neon(q, dq[0], dst_v, stride); + } + + q += 32; + dst_v += 4 * stride; + + if (((short *)(eobs))[3]) { + if (((short *)eobs)[3] & 0xfefe) + idct_dequant_full_2x_neon(q, dq, dst_v, stride); + else + idct_dequant_0_2x_neon(q, dq[0], dst_v, stride); + } +} diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/iwalsh_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/iwalsh_neon.c new file mode 100644 index 0000000000..91600bfc00 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/arm/neon/iwalsh_neon.c @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vp8_rtcd.h" + +void vp8_short_inv_walsh4x4_neon(int16_t *input, int16_t *mb_dqcoeff) { + int16x8_t q0s16, q1s16, q2s16, q3s16; + int16x4_t d4s16, d5s16, d6s16, d7s16; + int16x4x2_t v2tmp0, v2tmp1; + int32x2x2_t v2tmp2, v2tmp3; + int16x8_t qAdd3; + + q0s16 = vld1q_s16(input); + q1s16 = vld1q_s16(input + 8); + + // 1st for loop + d4s16 = vadd_s16(vget_low_s16(q0s16), vget_high_s16(q1s16)); + d6s16 = vadd_s16(vget_high_s16(q0s16), vget_low_s16(q1s16)); + d5s16 = vsub_s16(vget_low_s16(q0s16), vget_high_s16(q1s16)); + d7s16 = vsub_s16(vget_high_s16(q0s16), vget_low_s16(q1s16)); + + q2s16 = vcombine_s16(d4s16, d5s16); + q3s16 = vcombine_s16(d6s16, d7s16); + + q0s16 = vaddq_s16(q2s16, q3s16); + q1s16 = vsubq_s16(q2s16, q3s16); + + v2tmp2 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(q0s16)), + vreinterpret_s32_s16(vget_low_s16(q1s16))); + v2tmp3 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(q0s16)), + vreinterpret_s32_s16(vget_high_s16(q1s16))); + v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]), + vreinterpret_s16_s32(v2tmp3.val[0])); + v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]), + vreinterpret_s16_s32(v2tmp3.val[1])); + + // 2nd for loop + d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]); + d6s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]); + d5s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]); + d7s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]); + q2s16 = vcombine_s16(d4s16, d5s16); + q3s16 = vcombine_s16(d6s16, d7s16); + + qAdd3 = vdupq_n_s16(3); + + q0s16 = vaddq_s16(q2s16, q3s16); + q1s16 = vsubq_s16(q2s16, q3s16); + + q0s16 = vaddq_s16(q0s16, qAdd3); + q1s16 = vaddq_s16(q1s16, qAdd3); + + q0s16 = vshrq_n_s16(q0s16, 3); + q1s16 = vshrq_n_s16(q1s16, 3); + + // store + vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 0); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 0); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 0); + 
mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 0); + mb_dqcoeff += 16; + + vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 1); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 1); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 1); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 1); + mb_dqcoeff += 16; + + vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 2); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 2); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 2); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 2); + mb_dqcoeff += 16; + + vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 3); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 3); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 3); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 3); + mb_dqcoeff += 16; + return; +} diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c new file mode 100644 index 0000000000..df983b23a3 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vpx_config.h" +#include "./vp8_rtcd.h" + +static INLINE void vp8_loop_filter_simple_horizontal_edge_neon( + unsigned char *s, int p, const unsigned char *blimit) { + uint8_t *sp; + uint8x16_t qblimit, q0u8; + uint8x16_t q5u8, q6u8, q7u8, q8u8, q9u8, q10u8, q14u8, q15u8; + int16x8_t q2s16, q3s16, q13s16; + int8x8_t d8s8, d9s8; + int8x16_t q2s8, q3s8, q4s8, q10s8, q11s8, q14s8; + + qblimit = vdupq_n_u8(*blimit); + + sp = s - (p << 1); + q5u8 = vld1q_u8(sp); + sp += p; + q6u8 = vld1q_u8(sp); + sp += p; + q7u8 = vld1q_u8(sp); + sp += p; + q8u8 = vld1q_u8(sp); + + q15u8 = vabdq_u8(q6u8, q7u8); + q14u8 = vabdq_u8(q5u8, q8u8); + + q15u8 = vqaddq_u8(q15u8, q15u8); + q14u8 = vshrq_n_u8(q14u8, 1); + q0u8 = vdupq_n_u8(0x80); + q13s16 = vdupq_n_s16(3); + q15u8 = vqaddq_u8(q15u8, q14u8); + + q5u8 = veorq_u8(q5u8, q0u8); + q6u8 = veorq_u8(q6u8, q0u8); + q7u8 = veorq_u8(q7u8, q0u8); + q8u8 = veorq_u8(q8u8, q0u8); + + q15u8 = vcgeq_u8(qblimit, q15u8); + + q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7u8)), + vget_low_s8(vreinterpretq_s8_u8(q6u8))); + q3s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7u8)), + vget_high_s8(vreinterpretq_s8_u8(q6u8))); + + q4s8 = vqsubq_s8(vreinterpretq_s8_u8(q5u8), vreinterpretq_s8_u8(q8u8)); + + q2s16 = vmulq_s16(q2s16, q13s16); + q3s16 = vmulq_s16(q3s16, q13s16); + + q10u8 = vdupq_n_u8(3); + q9u8 = vdupq_n_u8(4); + + q2s16 = vaddw_s8(q2s16, vget_low_s8(q4s8)); + q3s16 = vaddw_s8(q3s16, vget_high_s8(q4s8)); + + d8s8 = vqmovn_s16(q2s16); + d9s8 = vqmovn_s16(q3s16); + q4s8 = vcombine_s8(d8s8, d9s8); + + q14s8 = vandq_s8(q4s8, vreinterpretq_s8_u8(q15u8)); + + q2s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q10u8)); + q3s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q9u8)); + q2s8 = vshrq_n_s8(q2s8, 3); + q3s8 = vshrq_n_s8(q3s8, 
3); + + q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6u8), q2s8); + q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q7u8), q3s8); + + q6u8 = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8); + q7u8 = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8); + + vst1q_u8(s, q7u8); + s -= p; + vst1q_u8(s, q6u8); + return; +} + +void vp8_loop_filter_bhs_neon(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) { + y_ptr += y_stride * 4; + vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit); + y_ptr += y_stride * 4; + vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit); + y_ptr += y_stride * 4; + vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit); + return; +} + +void vp8_loop_filter_mbhs_neon(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) { + vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit); + return; +} diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c new file mode 100644 index 0000000000..fbc83ae290 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c @@ -0,0 +1,274 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vpx_config.h" +#include "./vp8_rtcd.h" +#include "vpx_ports/arm.h" + +#ifdef VPX_INCOMPATIBLE_GCC +static INLINE void write_2x4(unsigned char *dst, int pitch, + const uint8x8x2_t result) { + /* + * uint8x8x2_t result + 00 01 02 03 | 04 05 06 07 + 10 11 12 13 | 14 15 16 17 + --- + * after vtrn_u8 + 00 10 02 12 | 04 14 06 16 + 01 11 03 13 | 05 15 07 17 + */ + const uint8x8x2_t r01_u8 = vtrn_u8(result.val[0], result.val[1]); + const uint16x4_t x_0_4 = vreinterpret_u16_u8(r01_u8.val[0]); + const uint16x4_t x_1_5 = vreinterpret_u16_u8(r01_u8.val[1]); + vst1_lane_u16((uint16_t *)dst, x_0_4, 0); + dst += pitch; + vst1_lane_u16((uint16_t *)dst, x_1_5, 0); + dst += pitch; + vst1_lane_u16((uint16_t *)dst, x_0_4, 1); + dst += pitch; + vst1_lane_u16((uint16_t *)dst, x_1_5, 1); + dst += pitch; + vst1_lane_u16((uint16_t *)dst, x_0_4, 2); + dst += pitch; + vst1_lane_u16((uint16_t *)dst, x_1_5, 2); + dst += pitch; + vst1_lane_u16((uint16_t *)dst, x_0_4, 3); + dst += pitch; + vst1_lane_u16((uint16_t *)dst, x_1_5, 3); +} + +static INLINE void write_2x8(unsigned char *dst, int pitch, + const uint8x8x2_t result, + const uint8x8x2_t result2) { + write_2x4(dst, pitch, result); + dst += pitch * 8; + write_2x4(dst, pitch, result2); +} +#else +static INLINE void write_2x8(unsigned char *dst, int pitch, + const uint8x8x2_t result, + const uint8x8x2_t result2) { + vst2_lane_u8(dst, result, 0); + dst += pitch; + vst2_lane_u8(dst, result, 1); + dst += pitch; + vst2_lane_u8(dst, result, 2); + dst += pitch; + vst2_lane_u8(dst, result, 3); + dst += pitch; + vst2_lane_u8(dst, result, 4); + dst += pitch; + vst2_lane_u8(dst, result, 5); + dst += pitch; + vst2_lane_u8(dst, result, 6); + dst += pitch; + vst2_lane_u8(dst, result, 7); + dst += pitch; + + vst2_lane_u8(dst, result2, 0); + dst += pitch; + vst2_lane_u8(dst, result2, 1); + dst += pitch; + vst2_lane_u8(dst, result2, 2); + dst += 
pitch; + vst2_lane_u8(dst, result2, 3); + dst += pitch; + vst2_lane_u8(dst, result2, 4); + dst += pitch; + vst2_lane_u8(dst, result2, 5); + dst += pitch; + vst2_lane_u8(dst, result2, 6); + dst += pitch; + vst2_lane_u8(dst, result2, 7); +} +#endif // VPX_INCOMPATIBLE_GCC + +#ifdef VPX_INCOMPATIBLE_GCC +static INLINE uint8x8x4_t read_4x8(unsigned char *src, int pitch) { + uint8x8x4_t x; + const uint8x8_t a = vld1_u8(src); + const uint8x8_t b = vld1_u8(src + pitch * 1); + const uint8x8_t c = vld1_u8(src + pitch * 2); + const uint8x8_t d = vld1_u8(src + pitch * 3); + const uint8x8_t e = vld1_u8(src + pitch * 4); + const uint8x8_t f = vld1_u8(src + pitch * 5); + const uint8x8_t g = vld1_u8(src + pitch * 6); + const uint8x8_t h = vld1_u8(src + pitch * 7); + const uint32x2x2_t r04_u32 = + vtrn_u32(vreinterpret_u32_u8(a), vreinterpret_u32_u8(e)); + const uint32x2x2_t r15_u32 = + vtrn_u32(vreinterpret_u32_u8(b), vreinterpret_u32_u8(f)); + const uint32x2x2_t r26_u32 = + vtrn_u32(vreinterpret_u32_u8(c), vreinterpret_u32_u8(g)); + const uint32x2x2_t r37_u32 = + vtrn_u32(vreinterpret_u32_u8(d), vreinterpret_u32_u8(h)); + const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u32(r04_u32.val[0]), + vreinterpret_u16_u32(r26_u32.val[0])); + const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u32(r15_u32.val[0]), + vreinterpret_u16_u32(r37_u32.val[0])); + const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]), + vreinterpret_u8_u16(r13_u16.val[0])); + const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]), + vreinterpret_u8_u16(r13_u16.val[1])); + /* + * after vtrn_u32 + 00 01 02 03 | 40 41 42 43 + 10 11 12 13 | 50 51 52 53 + 20 21 22 23 | 60 61 62 63 + 30 31 32 33 | 70 71 72 73 + --- + * after vtrn_u16 + 00 01 20 21 | 40 41 60 61 + 02 03 22 23 | 42 43 62 63 + 10 11 30 31 | 50 51 70 71 + 12 13 32 33 | 52 52 72 73 + + 00 01 20 21 | 40 41 60 61 + 10 11 30 31 | 50 51 70 71 + 02 03 22 23 | 42 43 62 63 + 12 13 32 33 | 52 52 72 73 + --- + * after vtrn_u8 + 00 10 20 30 | 40 50 60 70 + 01 11 21 31 | 41 51 61 71 + 02 12 22 32 | 42 52 62 72 + 03 13 23 33 | 43 53 63 73 + */ + x.val[0] = r01_u8.val[0]; + x.val[1] = r01_u8.val[1]; + x.val[2] = r23_u8.val[0]; + x.val[3] = r23_u8.val[1]; + + return x; +} +#else +static INLINE uint8x8x4_t read_4x8(unsigned char *src, int pitch) { + uint8x8x4_t x; + x.val[0] = x.val[1] = x.val[2] = x.val[3] = vdup_n_u8(0); + x = vld4_lane_u8(src, x, 0); + src += pitch; + x = vld4_lane_u8(src, x, 1); + src += pitch; + x = vld4_lane_u8(src, x, 2); + src += pitch; + x = vld4_lane_u8(src, x, 3); + src += pitch; + x = vld4_lane_u8(src, x, 4); + src += pitch; + x = vld4_lane_u8(src, x, 5); + src += pitch; + x = vld4_lane_u8(src, x, 6); + src += pitch; + x = vld4_lane_u8(src, x, 7); + return x; +} +#endif // VPX_INCOMPATIBLE_GCC + +static INLINE void vp8_loop_filter_simple_vertical_edge_neon( + unsigned char *s, int p, const unsigned char *blimit) { + unsigned char *src1; + uint8x16_t qblimit, q0u8; + uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q11u8, q12u8, q14u8, q15u8; + int16x8_t q2s16, q13s16, q11s16; + int8x8_t d28s8, d29s8; + int8x16_t q2s8, q3s8, q10s8, q11s8, q14s8; + uint8x8x4_t d0u8x4; // d6, d7, d8, d9 + uint8x8x4_t d1u8x4; // d10, d11, d12, d13 + uint8x8x2_t d2u8x2; // d12, d13 + uint8x8x2_t d3u8x2; // d14, d15 + + qblimit = vdupq_n_u8(*blimit); + + src1 = s - 2; + d0u8x4 = read_4x8(src1, p); + src1 += p * 8; + d1u8x4 = read_4x8(src1, p); + + q3u8 = vcombine_u8(d0u8x4.val[0], d1u8x4.val[0]); // d6 d10 + q4u8 = vcombine_u8(d0u8x4.val[2], d1u8x4.val[2]); 
// d8 d12 + q5u8 = vcombine_u8(d0u8x4.val[1], d1u8x4.val[1]); // d7 d11 + q6u8 = vcombine_u8(d0u8x4.val[3], d1u8x4.val[3]); // d9 d13 + + q15u8 = vabdq_u8(q5u8, q4u8); + q14u8 = vabdq_u8(q3u8, q6u8); + + q15u8 = vqaddq_u8(q15u8, q15u8); + q14u8 = vshrq_n_u8(q14u8, 1); + q0u8 = vdupq_n_u8(0x80); + q11s16 = vdupq_n_s16(3); + q15u8 = vqaddq_u8(q15u8, q14u8); + + q3u8 = veorq_u8(q3u8, q0u8); + q4u8 = veorq_u8(q4u8, q0u8); + q5u8 = veorq_u8(q5u8, q0u8); + q6u8 = veorq_u8(q6u8, q0u8); + + q15u8 = vcgeq_u8(qblimit, q15u8); + + q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q4u8)), + vget_low_s8(vreinterpretq_s8_u8(q5u8))); + q13s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q4u8)), + vget_high_s8(vreinterpretq_s8_u8(q5u8))); + + q14s8 = vqsubq_s8(vreinterpretq_s8_u8(q3u8), vreinterpretq_s8_u8(q6u8)); + + q2s16 = vmulq_s16(q2s16, q11s16); + q13s16 = vmulq_s16(q13s16, q11s16); + + q11u8 = vdupq_n_u8(3); + q12u8 = vdupq_n_u8(4); + + q2s16 = vaddw_s8(q2s16, vget_low_s8(q14s8)); + q13s16 = vaddw_s8(q13s16, vget_high_s8(q14s8)); + + d28s8 = vqmovn_s16(q2s16); + d29s8 = vqmovn_s16(q13s16); + q14s8 = vcombine_s8(d28s8, d29s8); + + q14s8 = vandq_s8(q14s8, vreinterpretq_s8_u8(q15u8)); + + q2s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q11u8)); + q3s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q12u8)); + q2s8 = vshrq_n_s8(q2s8, 3); + q14s8 = vshrq_n_s8(q3s8, 3); + + q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q5u8), q2s8); + q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q4u8), q14s8); + + q6u8 = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8); + q7u8 = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8); + + d2u8x2.val[0] = vget_low_u8(q6u8); // d12 + d2u8x2.val[1] = vget_low_u8(q7u8); // d14 + d3u8x2.val[0] = vget_high_u8(q6u8); // d13 + d3u8x2.val[1] = vget_high_u8(q7u8); // d15 + + src1 = s - 1; + write_2x8(src1, p, d2u8x2, d3u8x2); +} + +void vp8_loop_filter_bvs_neon(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) { + y_ptr += 4; + vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit); + y_ptr += 4; + vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit); + y_ptr += 4; + vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit); + return; +} + +void vp8_loop_filter_mbvs_neon(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) { + vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit); + return; +} diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c new file mode 100644 index 0000000000..fafaf2d451 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c @@ -0,0 +1,613 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> + +#include "./vpx_config.h" +#include "vp8/common/arm/loopfilter_arm.h" + +static INLINE void vp8_mbloop_filter_neon(uint8x16_t qblimit, // mblimit + uint8x16_t qlimit, // limit + uint8x16_t qthresh, // thresh + uint8x16_t q3, // p3 + uint8x16_t q4, // p2 + uint8x16_t q5, // p1 + uint8x16_t q6, // p0 + uint8x16_t q7, // q0 + uint8x16_t q8, // q1 + uint8x16_t q9, // q2 + uint8x16_t q10, // q3 + uint8x16_t *q4r, // p2 + uint8x16_t *q5r, // p1 + uint8x16_t *q6r, // p0 + uint8x16_t *q7r, // q0 + uint8x16_t *q8r, // q1 + uint8x16_t *q9r) { // q2 + uint8x16_t q0u8, q1u8, q11u8, q12u8, q13u8, q14u8, q15u8; + int16x8_t q0s16, q2s16, q11s16, q12s16, q13s16, q14s16, q15s16; + int8x16_t q1s8, q6s8, q7s8, q2s8, q11s8, q13s8; + uint16x8_t q0u16, q11u16, q12u16, q13u16, q14u16, q15u16; + int8x16_t q0s8, q12s8, q14s8, q15s8; + int8x8_t d0, d1, d2, d3, d4, d5, d24, d25, d28, d29; + + q11u8 = vabdq_u8(q3, q4); + q12u8 = vabdq_u8(q4, q5); + q13u8 = vabdq_u8(q5, q6); + q14u8 = vabdq_u8(q8, q7); + q1u8 = vabdq_u8(q9, q8); + q0u8 = vabdq_u8(q10, q9); + + q11u8 = vmaxq_u8(q11u8, q12u8); + q12u8 = vmaxq_u8(q13u8, q14u8); + q1u8 = vmaxq_u8(q1u8, q0u8); + q15u8 = vmaxq_u8(q11u8, q12u8); + + q12u8 = vabdq_u8(q6, q7); + + // vp8_hevmask + q13u8 = vcgtq_u8(q13u8, qthresh); + q14u8 = vcgtq_u8(q14u8, qthresh); + q15u8 = vmaxq_u8(q15u8, q1u8); + + q15u8 = vcgeq_u8(qlimit, q15u8); + + q1u8 = vabdq_u8(q5, q8); + q12u8 = vqaddq_u8(q12u8, q12u8); + + // vp8_filter() function + // convert to signed + q0u8 = vdupq_n_u8(0x80); + q9 = veorq_u8(q9, q0u8); + q8 = veorq_u8(q8, q0u8); + q7 = veorq_u8(q7, q0u8); + q6 = veorq_u8(q6, q0u8); + q5 = veorq_u8(q5, q0u8); + q4 = veorq_u8(q4, q0u8); + + q1u8 = vshrq_n_u8(q1u8, 1); + q12u8 = vqaddq_u8(q12u8, q1u8); + + q14u8 = vorrq_u8(q13u8, q14u8); + q12u8 = vcgeq_u8(qblimit, q12u8); + + q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)), + vget_low_s8(vreinterpretq_s8_u8(q6))); + q13s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)), + vget_high_s8(vreinterpretq_s8_u8(q6))); + + q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5), vreinterpretq_s8_u8(q8)); + + q11s16 = vdupq_n_s16(3); + q2s16 = vmulq_s16(q2s16, q11s16); + q13s16 = vmulq_s16(q13s16, q11s16); + + q15u8 = vandq_u8(q15u8, q12u8); + + q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8)); + q13s16 = vaddw_s8(q13s16, vget_high_s8(q1s8)); + + q12u8 = vdupq_n_u8(3); + q11u8 = vdupq_n_u8(4); + // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0)) + d2 = vqmovn_s16(q2s16); + d3 = vqmovn_s16(q13s16); + q1s8 = vcombine_s8(d2, d3); + q1s8 = vandq_s8(q1s8, vreinterpretq_s8_u8(q15u8)); + q13s8 = vandq_s8(q1s8, vreinterpretq_s8_u8(q14u8)); + + q2s8 = vqaddq_s8(q13s8, vreinterpretq_s8_u8(q11u8)); + q13s8 = vqaddq_s8(q13s8, vreinterpretq_s8_u8(q12u8)); + q2s8 = vshrq_n_s8(q2s8, 3); + q13s8 = vshrq_n_s8(q13s8, 3); + + q7s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q2s8); + q6s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q13s8); + + q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8)); + + q0u16 = q11u16 = q12u16 = q13u16 = q14u16 = q15u16 = vdupq_n_u16(63); + d5 = vdup_n_s8(9); + d4 = vdup_n_s8(18); + + q0s16 = vmlal_s8(vreinterpretq_s16_u16(q0u16), vget_low_s8(q1s8), d5); + q11s16 = vmlal_s8(vreinterpretq_s16_u16(q11u16), vget_high_s8(q1s8), d5); + d5 = vdup_n_s8(27); + q12s16 = vmlal_s8(vreinterpretq_s16_u16(q12u16), vget_low_s8(q1s8), d4); + q13s16 = vmlal_s8(vreinterpretq_s16_u16(q13u16), vget_high_s8(q1s8), d4); + q14s16 = vmlal_s8(vreinterpretq_s16_u16(q14u16), vget_low_s8(q1s8), d5); + q15s16 = vmlal_s8(vreinterpretq_s16_u16(q15u16), 
vget_high_s8(q1s8), d5); + + d0 = vqshrn_n_s16(q0s16, 7); + d1 = vqshrn_n_s16(q11s16, 7); + d24 = vqshrn_n_s16(q12s16, 7); + d25 = vqshrn_n_s16(q13s16, 7); + d28 = vqshrn_n_s16(q14s16, 7); + d29 = vqshrn_n_s16(q15s16, 7); + + q0s8 = vcombine_s8(d0, d1); + q12s8 = vcombine_s8(d24, d25); + q14s8 = vcombine_s8(d28, d29); + + q11s8 = vqsubq_s8(vreinterpretq_s8_u8(q9), q0s8); + q0s8 = vqaddq_s8(vreinterpretq_s8_u8(q4), q0s8); + q13s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q12s8); + q12s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q12s8); + q15s8 = vqsubq_s8((q7s8), q14s8); + q14s8 = vqaddq_s8((q6s8), q14s8); + + q1u8 = vdupq_n_u8(0x80); + *q9r = veorq_u8(vreinterpretq_u8_s8(q11s8), q1u8); + *q8r = veorq_u8(vreinterpretq_u8_s8(q13s8), q1u8); + *q7r = veorq_u8(vreinterpretq_u8_s8(q15s8), q1u8); + *q6r = veorq_u8(vreinterpretq_u8_s8(q14s8), q1u8); + *q5r = veorq_u8(vreinterpretq_u8_s8(q12s8), q1u8); + *q4r = veorq_u8(vreinterpretq_u8_s8(q0s8), q1u8); + return; +} + +void vp8_mbloop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh) { + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + + src -= (pitch << 2); + + q3 = vld1q_u8(src); + src += pitch; + q4 = vld1q_u8(src); + src += pitch; + q5 = vld1q_u8(src); + src += pitch; + q6 = vld1q_u8(src); + src += pitch; + q7 = vld1q_u8(src); + src += pitch; + q8 = vld1q_u8(src); + src += pitch; + q9 = vld1q_u8(src); + src += pitch; + q10 = vld1q_u8(src); + + vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4, q5, q6, q7, q8, q9, + q10, &q4, &q5, &q6, &q7, &q8, &q9); + + src -= (pitch * 6); + vst1q_u8(src, q4); + src += pitch; + vst1q_u8(src, q5); + src += pitch; + vst1q_u8(src, q6); + src += pitch; + vst1q_u8(src, q7); + src += pitch; + vst1q_u8(src, q8); + src += pitch; + vst1q_u8(src, q9); + return; +} + +void vp8_mbloop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh, + unsigned char *v) { + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14; + uint8x8_t d15, d16, d17, d18, d19, d20, d21; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + + u -= (pitch << 2); + v -= (pitch << 2); + + d6 = vld1_u8(u); + u += pitch; + d7 = vld1_u8(v); + v += pitch; + d8 = vld1_u8(u); + u += pitch; + d9 = vld1_u8(v); + v += pitch; + d10 = vld1_u8(u); + u += pitch; + d11 = vld1_u8(v); + v += pitch; + d12 = vld1_u8(u); + u += pitch; + d13 = vld1_u8(v); + v += pitch; + d14 = vld1_u8(u); + u += pitch; + d15 = vld1_u8(v); + v += pitch; + d16 = vld1_u8(u); + u += pitch; + d17 = vld1_u8(v); + v += pitch; + d18 = vld1_u8(u); + u += pitch; + d19 = vld1_u8(v); + v += pitch; + d20 = vld1_u8(u); + d21 = vld1_u8(v); + + q3 = vcombine_u8(d6, d7); + q4 = vcombine_u8(d8, d9); + q5 = vcombine_u8(d10, d11); + q6 = vcombine_u8(d12, d13); + q7 = vcombine_u8(d14, d15); + q8 = vcombine_u8(d16, d17); + q9 = vcombine_u8(d18, d19); + q10 = vcombine_u8(d20, d21); + + vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4, q5, q6, q7, q8, q9, + q10, &q4, &q5, &q6, &q7, &q8, &q9); + + u -= (pitch * 6); + v -= (pitch * 6); + vst1_u8(u, vget_low_u8(q4)); + u += pitch; + vst1_u8(v, vget_high_u8(q4)); + v += pitch; + vst1_u8(u, vget_low_u8(q5)); + u += pitch; + vst1_u8(v, vget_high_u8(q5)); + 
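// Each q register holds a U row in its low half and the matching V row in + // its high half, so the vget_low_u8() / vget_high_u8() stores write the + // two planes back in lockstep, one row per pitch step. + 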
v += pitch; + vst1_u8(u, vget_low_u8(q6)); + u += pitch; + vst1_u8(v, vget_high_u8(q6)); + v += pitch; + vst1_u8(u, vget_low_u8(q7)); + u += pitch; + vst1_u8(v, vget_high_u8(q7)); + v += pitch; + vst1_u8(u, vget_low_u8(q8)); + u += pitch; + vst1_u8(v, vget_high_u8(q8)); + v += pitch; + vst1_u8(u, vget_low_u8(q9)); + vst1_u8(v, vget_high_u8(q9)); + return; +} + +void vp8_mbloop_filter_vertical_edge_y_neon(unsigned char *src, int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh) { + unsigned char *s1, *s2; + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14; + uint8x8_t d15, d16, d17, d18, d19, d20, d21; + uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3; + uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7; + uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + + s1 = src - 4; + s2 = s1 + 8 * pitch; + d6 = vld1_u8(s1); + s1 += pitch; + d7 = vld1_u8(s2); + s2 += pitch; + d8 = vld1_u8(s1); + s1 += pitch; + d9 = vld1_u8(s2); + s2 += pitch; + d10 = vld1_u8(s1); + s1 += pitch; + d11 = vld1_u8(s2); + s2 += pitch; + d12 = vld1_u8(s1); + s1 += pitch; + d13 = vld1_u8(s2); + s2 += pitch; + d14 = vld1_u8(s1); + s1 += pitch; + d15 = vld1_u8(s2); + s2 += pitch; + d16 = vld1_u8(s1); + s1 += pitch; + d17 = vld1_u8(s2); + s2 += pitch; + d18 = vld1_u8(s1); + s1 += pitch; + d19 = vld1_u8(s2); + s2 += pitch; + d20 = vld1_u8(s1); + d21 = vld1_u8(s2); + + q3 = vcombine_u8(d6, d7); + q4 = vcombine_u8(d8, d9); + q5 = vcombine_u8(d10, d11); + q6 = vcombine_u8(d12, d13); + q7 = vcombine_u8(d14, d15); + q8 = vcombine_u8(d16, d17); + q9 = vcombine_u8(d18, d19); + q10 = vcombine_u8(d20, d21); + + q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7)); + q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8)); + q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9)); + q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10)); + + q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]), + vreinterpretq_u16_u32(q2tmp2.val[0])); + q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]), + vreinterpretq_u16_u32(q2tmp3.val[0])); + q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]), + vreinterpretq_u16_u32(q2tmp2.val[1])); + q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]), + vreinterpretq_u16_u32(q2tmp3.val[1])); + + q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]), + vreinterpretq_u8_u16(q2tmp5.val[0])); + q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]), + vreinterpretq_u8_u16(q2tmp5.val[1])); + q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]), + vreinterpretq_u8_u16(q2tmp7.val[0])); + q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]), + vreinterpretq_u8_u16(q2tmp7.val[1])); + + q3 = q2tmp8.val[0]; + q4 = q2tmp8.val[1]; + q5 = q2tmp9.val[0]; + q6 = q2tmp9.val[1]; + q7 = q2tmp10.val[0]; + q8 = q2tmp10.val[1]; + q9 = q2tmp11.val[0]; + q10 = q2tmp11.val[1]; + + vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4, q5, q6, q7, q8, q9, + q10, &q4, &q5, &q6, &q7, &q8, &q9); + + q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7)); + q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8)); + q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9)); + q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10)); + + q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]), + 
vreinterpretq_u16_u32(q2tmp2.val[0])); + q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]), + vreinterpretq_u16_u32(q2tmp3.val[0])); + q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]), + vreinterpretq_u16_u32(q2tmp2.val[1])); + q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]), + vreinterpretq_u16_u32(q2tmp3.val[1])); + + q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]), + vreinterpretq_u8_u16(q2tmp5.val[0])); + q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]), + vreinterpretq_u8_u16(q2tmp5.val[1])); + q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]), + vreinterpretq_u8_u16(q2tmp7.val[0])); + q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]), + vreinterpretq_u8_u16(q2tmp7.val[1])); + + q3 = q2tmp8.val[0]; + q4 = q2tmp8.val[1]; + q5 = q2tmp9.val[0]; + q6 = q2tmp9.val[1]; + q7 = q2tmp10.val[0]; + q8 = q2tmp10.val[1]; + q9 = q2tmp11.val[0]; + q10 = q2tmp11.val[1]; + + s1 -= 7 * pitch; + s2 -= 7 * pitch; + + vst1_u8(s1, vget_low_u8(q3)); + s1 += pitch; + vst1_u8(s2, vget_high_u8(q3)); + s2 += pitch; + vst1_u8(s1, vget_low_u8(q4)); + s1 += pitch; + vst1_u8(s2, vget_high_u8(q4)); + s2 += pitch; + vst1_u8(s1, vget_low_u8(q5)); + s1 += pitch; + vst1_u8(s2, vget_high_u8(q5)); + s2 += pitch; + vst1_u8(s1, vget_low_u8(q6)); + s1 += pitch; + vst1_u8(s2, vget_high_u8(q6)); + s2 += pitch; + vst1_u8(s1, vget_low_u8(q7)); + s1 += pitch; + vst1_u8(s2, vget_high_u8(q7)); + s2 += pitch; + vst1_u8(s1, vget_low_u8(q8)); + s1 += pitch; + vst1_u8(s2, vget_high_u8(q8)); + s2 += pitch; + vst1_u8(s1, vget_low_u8(q9)); + s1 += pitch; + vst1_u8(s2, vget_high_u8(q9)); + s2 += pitch; + vst1_u8(s1, vget_low_u8(q10)); + vst1_u8(s2, vget_high_u8(q10)); + return; +} + +void vp8_mbloop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh, + unsigned char *v) { + unsigned char *us, *ud; + unsigned char *vs, *vd; + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14; + uint8x8_t d15, d16, d17, d18, d19, d20, d21; + uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3; + uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7; + uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + + us = u - 4; + vs = v - 4; + d6 = vld1_u8(us); + us += pitch; + d7 = vld1_u8(vs); + vs += pitch; + d8 = vld1_u8(us); + us += pitch; + d9 = vld1_u8(vs); + vs += pitch; + d10 = vld1_u8(us); + us += pitch; + d11 = vld1_u8(vs); + vs += pitch; + d12 = vld1_u8(us); + us += pitch; + d13 = vld1_u8(vs); + vs += pitch; + d14 = vld1_u8(us); + us += pitch; + d15 = vld1_u8(vs); + vs += pitch; + d16 = vld1_u8(us); + us += pitch; + d17 = vld1_u8(vs); + vs += pitch; + d18 = vld1_u8(us); + us += pitch; + d19 = vld1_u8(vs); + vs += pitch; + d20 = vld1_u8(us); + d21 = vld1_u8(vs); + + q3 = vcombine_u8(d6, d7); + q4 = vcombine_u8(d8, d9); + q5 = vcombine_u8(d10, d11); + q6 = vcombine_u8(d12, d13); + q7 = vcombine_u8(d14, d15); + q8 = vcombine_u8(d16, d17); + q9 = vcombine_u8(d18, d19); + q10 = vcombine_u8(d20, d21); + + q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7)); + q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8)); + q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9)); + q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10)); + + q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]), + 
vreinterpretq_u16_u32(q2tmp2.val[0])); + q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]), + vreinterpretq_u16_u32(q2tmp3.val[0])); + q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]), + vreinterpretq_u16_u32(q2tmp2.val[1])); + q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]), + vreinterpretq_u16_u32(q2tmp3.val[1])); + + q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]), + vreinterpretq_u8_u16(q2tmp5.val[0])); + q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]), + vreinterpretq_u8_u16(q2tmp5.val[1])); + q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]), + vreinterpretq_u8_u16(q2tmp7.val[0])); + q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]), + vreinterpretq_u8_u16(q2tmp7.val[1])); + + q3 = q2tmp8.val[0]; + q4 = q2tmp8.val[1]; + q5 = q2tmp9.val[0]; + q6 = q2tmp9.val[1]; + q7 = q2tmp10.val[0]; + q8 = q2tmp10.val[1]; + q9 = q2tmp11.val[0]; + q10 = q2tmp11.val[1]; + + vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4, q5, q6, q7, q8, q9, + q10, &q4, &q5, &q6, &q7, &q8, &q9); + + q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7)); + q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8)); + q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9)); + q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10)); + + q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]), + vreinterpretq_u16_u32(q2tmp2.val[0])); + q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]), + vreinterpretq_u16_u32(q2tmp3.val[0])); + q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]), + vreinterpretq_u16_u32(q2tmp2.val[1])); + q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]), + vreinterpretq_u16_u32(q2tmp3.val[1])); + + q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]), + vreinterpretq_u8_u16(q2tmp5.val[0])); + q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]), + vreinterpretq_u8_u16(q2tmp5.val[1])); + q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]), + vreinterpretq_u8_u16(q2tmp7.val[0])); + q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]), + vreinterpretq_u8_u16(q2tmp7.val[1])); + + q3 = q2tmp8.val[0]; + q4 = q2tmp8.val[1]; + q5 = q2tmp9.val[0]; + q6 = q2tmp9.val[1]; + q7 = q2tmp10.val[0]; + q8 = q2tmp10.val[1]; + q9 = q2tmp11.val[0]; + q10 = q2tmp11.val[1]; + + ud = u - 4; + vst1_u8(ud, vget_low_u8(q3)); + ud += pitch; + vst1_u8(ud, vget_low_u8(q4)); + ud += pitch; + vst1_u8(ud, vget_low_u8(q5)); + ud += pitch; + vst1_u8(ud, vget_low_u8(q6)); + ud += pitch; + vst1_u8(ud, vget_low_u8(q7)); + ud += pitch; + vst1_u8(ud, vget_low_u8(q8)); + ud += pitch; + vst1_u8(ud, vget_low_u8(q9)); + ud += pitch; + vst1_u8(ud, vget_low_u8(q10)); + + vd = v - 4; + vst1_u8(vd, vget_high_u8(q3)); + vd += pitch; + vst1_u8(vd, vget_high_u8(q4)); + vd += pitch; + vst1_u8(vd, vget_high_u8(q5)); + vd += pitch; + vst1_u8(vd, vget_high_u8(q6)); + vd += pitch; + vst1_u8(vd, vget_high_u8(q7)); + vd += pitch; + vst1_u8(vd, vget_high_u8(q8)); + vd += pitch; + vst1_u8(vd, vget_high_u8(q9)); + vd += pitch; + vst1_u8(vd, vget_high_u8(q10)); + return; +} diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c new file mode 100644 index 0000000000..2724ca236b --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vp8_rtcd.h" + +static const int16_t cospi8sqrt2minus1 = 20091; +// 35468 exceeds INT16_MAX and gets converted to a negative number. Because of +// the way it is used in vqdmulh, where the result is doubled, it can be divided +// by 2 beforehand. This saves compensating for the negative value as well as +// shifting the result. +static const int16_t sinpi8sqrt2 = 35468 >> 1; + +void vp8_short_idct4x4llm_neon(int16_t *input, unsigned char *pred_ptr, + int pred_stride, unsigned char *dst_ptr, + int dst_stride) { + int i; + uint32x2_t d6u32 = vdup_n_u32(0); + uint8x8_t d1u8; + int16x4_t d2, d3, d4, d5, d10, d11, d12, d13; + uint16x8_t q1u16; + int16x8_t q1s16, q2s16, q3s16, q4s16; + int32x2x2_t v2tmp0, v2tmp1; + int16x4x2_t v2tmp2, v2tmp3; + + d2 = vld1_s16(input); + d3 = vld1_s16(input + 4); + d4 = vld1_s16(input + 8); + d5 = vld1_s16(input + 12); + + // 1st for loop + q1s16 = vcombine_s16(d2, d4); // Swap d3 d4 here + q2s16 = vcombine_s16(d3, d5); + + q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2); + q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1); + + d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // a1 + d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // b1 + + q4s16 = vshrq_n_s16(q4s16, 1); + + q4s16 = vqaddq_s16(q4s16, q2s16); + + d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16)); // c1 + d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16)); // d1 + + d2 = vqadd_s16(d12, d11); + d3 = vqadd_s16(d13, d10); + d4 = vqsub_s16(d13, d10); + d5 = vqsub_s16(d12, d11); + + v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4)); + v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5)); + v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]), + vreinterpret_s16_s32(v2tmp1.val[0])); + v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]), + vreinterpret_s16_s32(v2tmp1.val[1])); + + // 2nd for loop + q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp3.val[0]); + q2s16 = vcombine_s16(v2tmp2.val[1], v2tmp3.val[1]); + + q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2); + q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1); + + d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // a1 + d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // b1 + + q4s16 = vshrq_n_s16(q4s16, 1); + + q4s16 = vqaddq_s16(q4s16, q2s16); + + d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16)); // c1 + d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16)); // d1 + + d2 = vqadd_s16(d12, d11); + d3 = vqadd_s16(d13, d10); + d4 = vqsub_s16(d13, d10); + d5 = vqsub_s16(d12, d11); + + d2 = vrshr_n_s16(d2, 3); + d3 = vrshr_n_s16(d3, 3); + d4 = vrshr_n_s16(d4, 3); + d5 = vrshr_n_s16(d5, 3); + + v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4)); + v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5)); + v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]), + vreinterpret_s16_s32(v2tmp1.val[0])); + v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]), + vreinterpret_s16_s32(v2tmp1.val[1])); + + q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp2.val[1]); + q2s16 = vcombine_s16(v2tmp3.val[0], v2tmp3.val[1]); + + // dc_only_idct_add + for (i = 0; i < 2; i++, q1s16 = 
q2s16) { + d6u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d6u32, 0); + pred_ptr += pred_stride; + d6u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d6u32, 1); + pred_ptr += pred_stride; + + q1u16 = vaddw_u8(vreinterpretq_u16_s16(q1s16), vreinterpret_u8_u32(d6u32)); + d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16)); + + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d1u8), 0); + dst_ptr += dst_stride; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d1u8), 1); + dst_ptr += dst_stride; + } + return; +} diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/sixtappredict_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/sixtappredict_neon.c new file mode 100644 index 0000000000..ee3c281f0f --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/arm/neon/sixtappredict_neon.c @@ -0,0 +1,1729 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include <string.h> +#include "./vpx_config.h" +#include "./vp8_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_ports/mem.h" + +static const int8_t vp8_sub_pel_filters[8][8] = { + { 0, 0, 128, 0, 0, 0, 0, 0 }, /* note that 1/8 pel positions are */ + { 0, -6, 123, 12, -1, 0, 0, 0 }, /* just as per alpha -0.5 bicubic */ + { 2, -11, 108, 36, -8, 1, 0, 0 }, /* New 1/4 pel 6 tap filter */ + { 0, -9, 93, 50, -6, 0, 0, 0 }, + { 3, -16, 77, 77, -16, 3, 0, 0 }, /* New 1/2 pel 6 tap filter */ + { 0, -6, 50, 93, -9, 0, 0, 0 }, + { 1, -8, 36, 108, -11, 2, 0, 0 }, /* New 1/4 pel 6 tap filter */ + { 0, -1, 12, 123, -6, 0, 0, 0 }, +}; + +// This table is derived from vp8/common/filter.c:vp8_sub_pel_filters. +// Apply abs() to all the values. Elements 0, 2, 3, and 5 are always positive. +// Elements 1 and 4 are either 0 or negative. The code accounts for this with +// multiply/accumulates which either add or subtract as needed. The other +// functions will be updated to use this table later. +// It is also expanded to 8 elements to allow loading into 64 bit neon +// registers. 
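+// For example, row 1 of vp8_sub_pel_filters above is +// { 0, -6, 123, 12, -1, 0, 0, 0 }; its entry below is +// { 0, 6, 123, 12, 1, 0, 0, 0 }, and the filtering code compensates by using +// vmlsl_u8 (multiply-subtract) for taps 1 and 4 and vmlal_u8 +// (multiply-accumulate) for the remaining taps.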
+static const uint8_t abs_filters[8][8] = { + { 0, 0, 128, 0, 0, 0, 0, 0 }, { 0, 6, 123, 12, 1, 0, 0, 0 }, + { 2, 11, 108, 36, 8, 1, 0, 0 }, { 0, 9, 93, 50, 6, 0, 0, 0 }, + { 3, 16, 77, 77, 16, 3, 0, 0 }, { 0, 6, 50, 93, 9, 0, 0, 0 }, + { 1, 8, 36, 108, 11, 2, 0, 0 }, { 0, 1, 12, 123, 6, 0, 0, 0 }, +}; + +static INLINE uint8x8_t load_and_shift(const unsigned char *a) { + return vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(vld1_u8(a)), 32)); +} + +static INLINE void filter_add_accumulate(const uint8x16_t a, const uint8x16_t b, + const uint8x8_t filter, uint16x8_t *c, + uint16x8_t *d) { + const uint32x2x2_t a_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(a)), + vreinterpret_u32_u8(vget_high_u8(a))); + const uint32x2x2_t b_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(b)), + vreinterpret_u32_u8(vget_high_u8(b))); + *c = vmlal_u8(*c, vreinterpret_u8_u32(a_shuf.val[0]), filter); + *d = vmlal_u8(*d, vreinterpret_u8_u32(b_shuf.val[0]), filter); +} + +static INLINE void filter_sub_accumulate(const uint8x16_t a, const uint8x16_t b, + const uint8x8_t filter, uint16x8_t *c, + uint16x8_t *d) { + const uint32x2x2_t a_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(a)), + vreinterpret_u32_u8(vget_high_u8(a))); + const uint32x2x2_t b_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(b)), + vreinterpret_u32_u8(vget_high_u8(b))); + *c = vmlsl_u8(*c, vreinterpret_u8_u32(a_shuf.val[0]), filter); + *d = vmlsl_u8(*d, vreinterpret_u8_u32(b_shuf.val[0]), filter); +} + +static INLINE void yonly4x4(const unsigned char *src, int src_stride, + int filter_offset, unsigned char *dst, + int dst_stride) { + uint8x8_t a0, a1, a2, a3, a4, a5, a6, a7, a8; + uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7, b8; + uint16x8_t c0, c1, c2, c3; + int16x8_t d0, d1; + uint8x8_t e0, e1; + + const uint8x8_t filter = vld1_u8(abs_filters[filter_offset]); + const uint8x8_t filter0 = vdup_lane_u8(filter, 0); + const uint8x8_t filter1 = vdup_lane_u8(filter, 1); + const uint8x8_t filter2 = vdup_lane_u8(filter, 2); + const uint8x8_t filter3 = vdup_lane_u8(filter, 3); + const uint8x8_t filter4 = vdup_lane_u8(filter, 4); + const uint8x8_t filter5 = vdup_lane_u8(filter, 5); + + src -= src_stride * 2; + // Shift the even rows to allow using 'vext' to combine the vectors. armv8 + // has vcopy_lane which would be interesting. This started as just a + // horrible workaround for clang adding alignment hints to 32bit loads: + // https://llvm.org/bugs/show_bug.cgi?id=24421 + // But it turns out it is almost identical to casting the loads. + a0 = load_and_shift(src); + src += src_stride; + a1 = vld1_u8(src); + src += src_stride; + a2 = load_and_shift(src); + src += src_stride; + a3 = vld1_u8(src); + src += src_stride; + a4 = load_and_shift(src); + src += src_stride; + a5 = vld1_u8(src); + src += src_stride; + a6 = load_and_shift(src); + src += src_stride; + a7 = vld1_u8(src); + src += src_stride; + a8 = vld1_u8(src); + + // Combine the rows so we can operate on 8 at a time. + b0 = vext_u8(a0, a1, 4); + b2 = vext_u8(a2, a3, 4); + b4 = vext_u8(a4, a5, 4); + b6 = vext_u8(a6, a7, 4); + b8 = a8; + + // To keep with the 8-at-a-time theme, combine *alternate* rows. This + // allows combining the odd rows with the even. + b1 = vext_u8(b0, b2, 4); + b3 = vext_u8(b2, b4, 4); + b5 = vext_u8(b4, b6, 4); + b7 = vext_u8(b6, b8, 4); + + // Multiply and expand to 16 bits. 
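+ // Each u16 accumulator covers two output rows (8 lanes = two rows of 4 + // pixels): c0/c1 accumulate from the tap-0 rows and c2/c3 from the tap-5 + // rows; the two partial sums are merged with saturating adds below.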
+ c0 = vmull_u8(b0, filter0); + c1 = vmull_u8(b2, filter0); + c2 = vmull_u8(b5, filter5); + c3 = vmull_u8(b7, filter5); + + // Multiply, subtract and accumulate for filters 1 and 4 (the negative + // ones). + c0 = vmlsl_u8(c0, b4, filter4); + c1 = vmlsl_u8(c1, b6, filter4); + c2 = vmlsl_u8(c2, b1, filter1); + c3 = vmlsl_u8(c3, b3, filter1); + + // Add more positive ones. vmlal should really return a signed type. + // It's doing signed math internally, as evidenced by the fact we can do + // subtractions followed by more additions. Ideally we could use + // vqmlal/sl but that instruction doesn't exist. Might be able to + // shoehorn vqdmlal/vqdmlsl in here but it would take some effort. + c0 = vmlal_u8(c0, b2, filter2); + c1 = vmlal_u8(c1, b4, filter2); + c2 = vmlal_u8(c2, b3, filter3); + c3 = vmlal_u8(c3, b5, filter3); + + // Use signed saturation math because vmlsl may have left some negative + // numbers in there. + d0 = vqaddq_s16(vreinterpretq_s16_u16(c2), vreinterpretq_s16_u16(c0)); + d1 = vqaddq_s16(vreinterpretq_s16_u16(c3), vreinterpretq_s16_u16(c1)); + + // Use signed again because numbers like -200 need to be saturated to 0. + e0 = vqrshrun_n_s16(d0, 7); + e1 = vqrshrun_n_s16(d1, 7); + + store_unaligned_u8q(dst, dst_stride, vcombine_u8(e0, e1)); +} + +void vp8_sixtap_predict4x4_neon(unsigned char *src_ptr, int src_pixels_per_line, + int xoffset, int yoffset, + unsigned char *dst_ptr, int dst_pitch) { + uint8x16_t s0, s1, s2, s3, s4; + uint64x2_t s01, s23; + // Variables to hold src[] elements for the given filter[] + uint8x8_t s0_f5, s1_f5, s2_f5, s3_f5, s4_f5; + uint8x8_t s4_f1, s4_f2, s4_f3, s4_f4; + uint8x16_t s01_f0, s23_f0; + uint64x2_t s01_f3, s23_f3; + uint32x2x2_t s01_f3_q, s23_f3_q, s01_f5_q, s23_f5_q; + // Accumulator variables. + uint16x8_t d0123, d4567, d89; + uint16x8_t d0123_a, d4567_a, d89_a; + int16x8_t e0123, e4567, e89; + // Second pass intermediates. + uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7, b8; + uint16x8_t c0, c1, c2, c3; + int16x8_t d0, d1; + uint8x8_t e0, e1; + uint8x8_t filter, filter0, filter1, filter2, filter3, filter4, filter5; + + if (xoffset == 0) { // Second pass only. + yonly4x4(src_ptr, src_pixels_per_line, yoffset, dst_ptr, dst_pitch); + return; + } + + if (yoffset == 0) { // First pass only. + src_ptr -= 2; + } else { // Add context for the second pass. 2 extra lines on top. + src_ptr -= 2 + (src_pixels_per_line * 2); + } + + filter = vld1_u8(abs_filters[xoffset]); + filter0 = vdup_lane_u8(filter, 0); + filter1 = vdup_lane_u8(filter, 1); + filter2 = vdup_lane_u8(filter, 2); + filter3 = vdup_lane_u8(filter, 3); + filter4 = vdup_lane_u8(filter, 4); + filter5 = vdup_lane_u8(filter, 5); + + // 2 bytes of context, 4 bytes of src values, 3 bytes of context, 7 bytes of + // garbage. So much effort for that last single bit. + // The low values of each pair are for filter0. + s0 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + s1 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + s2 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + s3 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + + // Shift to extract values for filter[5] + // If src[] is 0, this puts: + // 3 4 5 6 7 8 9 10 in s0_f5 + // Can't use vshr.u64 because it crosses the double word boundary. 
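+ // (src_ptr was moved back by 2 above, so byte 5 of each 16-byte load is + // src[3]; vext_u8 with index 5 extracts bytes 5..12, i.e. src[3..10].)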
+ s0_f5 = vext_u8(vget_low_u8(s0), vget_high_u8(s0), 5); + s1_f5 = vext_u8(vget_low_u8(s1), vget_high_u8(s1), 5); + s2_f5 = vext_u8(vget_low_u8(s2), vget_high_u8(s2), 5); + s3_f5 = vext_u8(vget_low_u8(s3), vget_high_u8(s3), 5); + + s01_f0 = vcombine_u8(vget_low_u8(s0), vget_low_u8(s1)); + s23_f0 = vcombine_u8(vget_low_u8(s2), vget_low_u8(s3)); + + s01_f5_q = vzip_u32(vreinterpret_u32_u8(s0_f5), vreinterpret_u32_u8(s1_f5)); + s23_f5_q = vzip_u32(vreinterpret_u32_u8(s2_f5), vreinterpret_u32_u8(s3_f5)); + d0123 = vmull_u8(vreinterpret_u8_u32(s01_f5_q.val[0]), filter5); + d4567 = vmull_u8(vreinterpret_u8_u32(s23_f5_q.val[0]), filter5); + + // Keep original src data as 64 bits to simplify shifting and extracting. + s01 = vreinterpretq_u64_u8(s01_f0); + s23 = vreinterpretq_u64_u8(s23_f0); + + // 3 4 5 6 * filter0 + filter_add_accumulate(s01_f0, s23_f0, filter0, &d0123, &d4567); + + // Shift over one to use -1, 0, 1, 2 for filter1 + // -1 0 1 2 * filter1 + filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 8)), + vreinterpretq_u8_u64(vshrq_n_u64(s23, 8)), filter1, + &d0123, &d4567); + + // 2 3 4 5 * filter4 + filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 32)), + vreinterpretq_u8_u64(vshrq_n_u64(s23, 32)), filter4, + &d0123, &d4567); + + // 0 1 2 3 * filter2 + filter_add_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 16)), + vreinterpretq_u8_u64(vshrq_n_u64(s23, 16)), filter2, + &d0123, &d4567); + + // 1 2 3 4 * filter3 + s01_f3 = vshrq_n_u64(s01, 24); + s23_f3 = vshrq_n_u64(s23, 24); + s01_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s01_f3)), + vreinterpret_u32_u64(vget_high_u64(s01_f3))); + s23_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s23_f3)), + vreinterpret_u32_u64(vget_high_u64(s23_f3))); + // Accumulate into different registers so it can use saturated addition. + d0123_a = vmull_u8(vreinterpret_u8_u32(s01_f3_q.val[0]), filter3); + d4567_a = vmull_u8(vreinterpret_u8_u32(s23_f3_q.val[0]), filter3); + + e0123 = + vqaddq_s16(vreinterpretq_s16_u16(d0123), vreinterpretq_s16_u16(d0123_a)); + e4567 = + vqaddq_s16(vreinterpretq_s16_u16(d4567), vreinterpretq_s16_u16(d4567_a)); + + // Shift and narrow. + b0 = vqrshrun_n_s16(e0123, 7); + b2 = vqrshrun_n_s16(e4567, 7); + + if (yoffset == 0) { // firstpass_filter4x4_only + store_unaligned_u8q(dst_ptr, dst_pitch, vcombine_u8(b0, b2)); + return; + } + + // Load additional context when doing both filters. + s0 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + s1 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + s2 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + s3 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + s4 = vld1q_u8(src_ptr); + + s0_f5 = vext_u8(vget_low_u8(s0), vget_high_u8(s0), 5); + s1_f5 = vext_u8(vget_low_u8(s1), vget_high_u8(s1), 5); + s2_f5 = vext_u8(vget_low_u8(s2), vget_high_u8(s2), 5); + s3_f5 = vext_u8(vget_low_u8(s3), vget_high_u8(s3), 5); + s4_f5 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 5); + + // 3 4 5 6 * filter0 + s01_f0 = vcombine_u8(vget_low_u8(s0), vget_low_u8(s1)); + s23_f0 = vcombine_u8(vget_low_u8(s2), vget_low_u8(s3)); + + s01_f5_q = vzip_u32(vreinterpret_u32_u8(s0_f5), vreinterpret_u32_u8(s1_f5)); + s23_f5_q = vzip_u32(vreinterpret_u32_u8(s2_f5), vreinterpret_u32_u8(s3_f5)); + // But this time instead of 16 pixels to filter, there are 20. So an extra + // run with a doubleword register. 
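+ // (Five rows of 4 pixels: d0123/d4567 cover the first four rows as before, + // and the extra d89 accumulator below handles the fifth row from s4.)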
+ d0123 = vmull_u8(vreinterpret_u8_u32(s01_f5_q.val[0]), filter5); + d4567 = vmull_u8(vreinterpret_u8_u32(s23_f5_q.val[0]), filter5); + d89 = vmull_u8(s4_f5, filter5); + + // Save a copy as u64 for shifting. + s01 = vreinterpretq_u64_u8(s01_f0); + s23 = vreinterpretq_u64_u8(s23_f0); + + filter_add_accumulate(s01_f0, s23_f0, filter0, &d0123, &d4567); + d89 = vmlal_u8(d89, vget_low_u8(s4), filter0); + + filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 8)), + vreinterpretq_u8_u64(vshrq_n_u64(s23, 8)), filter1, + &d0123, &d4567); + s4_f1 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 1); + d89 = vmlsl_u8(d89, s4_f1, filter1); + + filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 32)), + vreinterpretq_u8_u64(vshrq_n_u64(s23, 32)), filter4, + &d0123, &d4567); + s4_f4 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 4); + d89 = vmlsl_u8(d89, s4_f4, filter4); + + filter_add_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 16)), + vreinterpretq_u8_u64(vshrq_n_u64(s23, 16)), filter2, + &d0123, &d4567); + s4_f2 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 2); + d89 = vmlal_u8(d89, s4_f2, filter2); + + s01_f3 = vshrq_n_u64(s01, 24); + s23_f3 = vshrq_n_u64(s23, 24); + s01_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s01_f3)), + vreinterpret_u32_u64(vget_high_u64(s01_f3))); + s23_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s23_f3)), + vreinterpret_u32_u64(vget_high_u64(s23_f3))); + s4_f3 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 3); + d0123_a = vmull_u8(vreinterpret_u8_u32(s01_f3_q.val[0]), filter3); + d4567_a = vmull_u8(vreinterpret_u8_u32(s23_f3_q.val[0]), filter3); + d89_a = vmull_u8(s4_f3, filter3); + + e0123 = + vqaddq_s16(vreinterpretq_s16_u16(d0123), vreinterpretq_s16_u16(d0123_a)); + e4567 = + vqaddq_s16(vreinterpretq_s16_u16(d4567), vreinterpretq_s16_u16(d4567_a)); + e89 = vqaddq_s16(vreinterpretq_s16_u16(d89), vreinterpretq_s16_u16(d89_a)); + + b4 = vqrshrun_n_s16(e0123, 7); + b6 = vqrshrun_n_s16(e4567, 7); + b8 = vqrshrun_n_s16(e89, 7); + + // Second pass: 4x4 + filter = vld1_u8(abs_filters[yoffset]); + filter0 = vdup_lane_u8(filter, 0); + filter1 = vdup_lane_u8(filter, 1); + filter2 = vdup_lane_u8(filter, 2); + filter3 = vdup_lane_u8(filter, 3); + filter4 = vdup_lane_u8(filter, 4); + filter5 = vdup_lane_u8(filter, 5); + + b1 = vext_u8(b0, b2, 4); + b3 = vext_u8(b2, b4, 4); + b5 = vext_u8(b4, b6, 4); + b7 = vext_u8(b6, b8, 4); + + c0 = vmull_u8(b0, filter0); + c1 = vmull_u8(b2, filter0); + c2 = vmull_u8(b5, filter5); + c3 = vmull_u8(b7, filter5); + + c0 = vmlsl_u8(c0, b4, filter4); + c1 = vmlsl_u8(c1, b6, filter4); + c2 = vmlsl_u8(c2, b1, filter1); + c3 = vmlsl_u8(c3, b3, filter1); + + c0 = vmlal_u8(c0, b2, filter2); + c1 = vmlal_u8(c1, b4, filter2); + c2 = vmlal_u8(c2, b3, filter3); + c3 = vmlal_u8(c3, b5, filter3); + + d0 = vqaddq_s16(vreinterpretq_s16_u16(c2), vreinterpretq_s16_u16(c0)); + d1 = vqaddq_s16(vreinterpretq_s16_u16(c3), vreinterpretq_s16_u16(c1)); + + e0 = vqrshrun_n_s16(d0, 7); + e1 = vqrshrun_n_s16(d1, 7); + + store_unaligned_u8q(dst_ptr, dst_pitch, vcombine_u8(e0, e1)); +} + +void vp8_sixtap_predict8x4_neon(unsigned char *src_ptr, int src_pixels_per_line, + int xoffset, int yoffset, + unsigned char *dst_ptr, int dst_pitch) { + unsigned char *src; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8; + uint8x8_t d22u8, d23u8, d24u8, d25u8, d26u8; + uint8x8_t d27u8, d28u8, d29u8, d30u8, d31u8; + int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8; + uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16; + uint16x8_t q8u16, q9u16, q10u16, 
q11u16, q12u16; + int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16; + uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8; + + if (xoffset == 0) { // secondpass_filter8x4_only + // load second_pass filter + dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + // load src data + src = src_ptr - src_pixels_per_line * 2; + d22u8 = vld1_u8(src); + src += src_pixels_per_line; + d23u8 = vld1_u8(src); + src += src_pixels_per_line; + d24u8 = vld1_u8(src); + src += src_pixels_per_line; + d25u8 = vld1_u8(src); + src += src_pixels_per_line; + d26u8 = vld1_u8(src); + src += src_pixels_per_line; + d27u8 = vld1_u8(src); + src += src_pixels_per_line; + d28u8 = vld1_u8(src); + src += src_pixels_per_line; + d29u8 = vld1_u8(src); + src += src_pixels_per_line; + d30u8 = vld1_u8(src); + + q3u16 = vmull_u8(d22u8, d0u8); + q4u16 = vmull_u8(d23u8, d0u8); + q5u16 = vmull_u8(d24u8, d0u8); + q6u16 = vmull_u8(d25u8, d0u8); + + q3u16 = vmlsl_u8(q3u16, d23u8, d1u8); + q4u16 = vmlsl_u8(q4u16, d24u8, d1u8); + q5u16 = vmlsl_u8(q5u16, d25u8, d1u8); + q6u16 = vmlsl_u8(q6u16, d26u8, d1u8); + + q3u16 = vmlsl_u8(q3u16, d26u8, d4u8); + q4u16 = vmlsl_u8(q4u16, d27u8, d4u8); + q5u16 = vmlsl_u8(q5u16, d28u8, d4u8); + q6u16 = vmlsl_u8(q6u16, d29u8, d4u8); + + q3u16 = vmlal_u8(q3u16, d24u8, d2u8); + q4u16 = vmlal_u8(q4u16, d25u8, d2u8); + q5u16 = vmlal_u8(q5u16, d26u8, d2u8); + q6u16 = vmlal_u8(q6u16, d27u8, d2u8); + + q3u16 = vmlal_u8(q3u16, d27u8, d5u8); + q4u16 = vmlal_u8(q4u16, d28u8, d5u8); + q5u16 = vmlal_u8(q5u16, d29u8, d5u8); + q6u16 = vmlal_u8(q6u16, d30u8, d5u8); + + q7u16 = vmull_u8(d25u8, d3u8); + q8u16 = vmull_u8(d26u8, d3u8); + q9u16 = vmull_u8(d27u8, d3u8); + q10u16 = vmull_u8(d28u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + + q7s16 = vqaddq_s16(q7s16, q3s16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q9s16 = vqaddq_s16(q9s16, q5s16); + q10s16 = vqaddq_s16(q10s16, q6s16); + + d6u8 = vqrshrun_n_s16(q7s16, 7); + d7u8 = vqrshrun_n_s16(q8s16, 7); + d8u8 = vqrshrun_n_s16(q9s16, 7); + d9u8 = vqrshrun_n_s16(q10s16, 7); + + vst1_u8(dst_ptr, d6u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d7u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d8u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d9u8); + return; + } + + // load first_pass filter + dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + // 
First pass: output_height lines x output_width columns (9x8) + if (yoffset == 0) // firstpass_filter8x4_only + src = src_ptr - 2; + else + src = src_ptr - 2 - (src_pixels_per_line * 2); + q3u8 = vld1q_u8(src); + src += src_pixels_per_line; + q4u8 = vld1q_u8(src); + src += src_pixels_per_line; + q5u8 = vld1q_u8(src); + src += src_pixels_per_line; + q6u8 = vld1q_u8(src); + + q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8); + q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8); + q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8); + q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1); + + q7u16 = vmlsl_u8(q7u16, d28u8, d1u8); + q8u16 = vmlsl_u8(q8u16, d29u8, d1u8); + q9u16 = vmlsl_u8(q9u16, d30u8, d1u8); + q10u16 = vmlsl_u8(q10u16, d31u8, d1u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4); + + q7u16 = vmlsl_u8(q7u16, d28u8, d4u8); + q8u16 = vmlsl_u8(q8u16, d29u8, d4u8); + q9u16 = vmlsl_u8(q9u16, d30u8, d4u8); + q10u16 = vmlsl_u8(q10u16, d31u8, d4u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2); + + q7u16 = vmlal_u8(q7u16, d28u8, d2u8); + q8u16 = vmlal_u8(q8u16, d29u8, d2u8); + q9u16 = vmlal_u8(q9u16, d30u8, d2u8); + q10u16 = vmlal_u8(q10u16, d31u8, d2u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5); + + q7u16 = vmlal_u8(q7u16, d28u8, d5u8); + q8u16 = vmlal_u8(q8u16, d29u8, d5u8); + q9u16 = vmlal_u8(q9u16, d30u8, d5u8); + q10u16 = vmlal_u8(q10u16, d31u8, d5u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3); + + q3u16 = vmull_u8(d28u8, d3u8); + q4u16 = vmull_u8(d29u8, d3u8); + q5u16 = vmull_u8(d30u8, d3u8); + q6u16 = vmull_u8(d31u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + + q7s16 = vqaddq_s16(q7s16, q3s16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q9s16 = vqaddq_s16(q9s16, q5s16); + q10s16 = vqaddq_s16(q10s16, q6s16); + + d22u8 = vqrshrun_n_s16(q7s16, 7); + d23u8 = vqrshrun_n_s16(q8s16, 7); + d24u8 = vqrshrun_n_s16(q9s16, 7); + d25u8 = vqrshrun_n_s16(q10s16, 7); + + if (yoffset == 0) { // firstpass_filter8x4_only + vst1_u8(dst_ptr, d22u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d23u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d24u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d25u8); + return; + } + + // First pass on the remaining 5 lines of data + src += src_pixels_per_line; + q3u8 = vld1q_u8(src); + src += 
src_pixels_per_line; + q4u8 = vld1q_u8(src); + src += src_pixels_per_line; + q5u8 = vld1q_u8(src); + src += src_pixels_per_line; + q6u8 = vld1q_u8(src); + src += src_pixels_per_line; + q7u8 = vld1q_u8(src); + + q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8); + q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8); + q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8); + q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8); + q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1); + + q8u16 = vmlsl_u8(q8u16, d27u8, d1u8); + q9u16 = vmlsl_u8(q9u16, d28u8, d1u8); + q10u16 = vmlsl_u8(q10u16, d29u8, d1u8); + q11u16 = vmlsl_u8(q11u16, d30u8, d1u8); + q12u16 = vmlsl_u8(q12u16, d31u8, d1u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4); + + q8u16 = vmlsl_u8(q8u16, d27u8, d4u8); + q9u16 = vmlsl_u8(q9u16, d28u8, d4u8); + q10u16 = vmlsl_u8(q10u16, d29u8, d4u8); + q11u16 = vmlsl_u8(q11u16, d30u8, d4u8); + q12u16 = vmlsl_u8(q12u16, d31u8, d4u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2); + + q8u16 = vmlal_u8(q8u16, d27u8, d2u8); + q9u16 = vmlal_u8(q9u16, d28u8, d2u8); + q10u16 = vmlal_u8(q10u16, d29u8, d2u8); + q11u16 = vmlal_u8(q11u16, d30u8, d2u8); + q12u16 = vmlal_u8(q12u16, d31u8, d2u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5); + + q8u16 = vmlal_u8(q8u16, d27u8, d5u8); + q9u16 = vmlal_u8(q9u16, d28u8, d5u8); + q10u16 = vmlal_u8(q10u16, d29u8, d5u8); + q11u16 = vmlal_u8(q11u16, d30u8, d5u8); + q12u16 = vmlal_u8(q12u16, d31u8, d5u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3); + + q3u16 = vmull_u8(d27u8, d3u8); + q4u16 = vmull_u8(d28u8, d3u8); + q5u16 = vmull_u8(d29u8, d3u8); + q6u16 = vmull_u8(d30u8, d3u8); + q7u16 = vmull_u8(d31u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + q11s16 = vreinterpretq_s16_u16(q11u16); + q12s16 = vreinterpretq_s16_u16(q12u16); + + q8s16 = vqaddq_s16(q8s16, q3s16); + q9s16 = vqaddq_s16(q9s16, q4s16); + q10s16 = vqaddq_s16(q10s16, q5s16); + q11s16 = vqaddq_s16(q11s16, q6s16); + q12s16 = 
vqaddq_s16(q12s16, q7s16); + + d26u8 = vqrshrun_n_s16(q8s16, 7); + d27u8 = vqrshrun_n_s16(q9s16, 7); + d28u8 = vqrshrun_n_s16(q10s16, 7); + d29u8 = vqrshrun_n_s16(q11s16, 7); + d30u8 = vqrshrun_n_s16(q12s16, 7); + + // Second pass: 8x4 + dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + q3u16 = vmull_u8(d22u8, d0u8); + q4u16 = vmull_u8(d23u8, d0u8); + q5u16 = vmull_u8(d24u8, d0u8); + q6u16 = vmull_u8(d25u8, d0u8); + + q3u16 = vmlsl_u8(q3u16, d23u8, d1u8); + q4u16 = vmlsl_u8(q4u16, d24u8, d1u8); + q5u16 = vmlsl_u8(q5u16, d25u8, d1u8); + q6u16 = vmlsl_u8(q6u16, d26u8, d1u8); + + q3u16 = vmlsl_u8(q3u16, d26u8, d4u8); + q4u16 = vmlsl_u8(q4u16, d27u8, d4u8); + q5u16 = vmlsl_u8(q5u16, d28u8, d4u8); + q6u16 = vmlsl_u8(q6u16, d29u8, d4u8); + + q3u16 = vmlal_u8(q3u16, d24u8, d2u8); + q4u16 = vmlal_u8(q4u16, d25u8, d2u8); + q5u16 = vmlal_u8(q5u16, d26u8, d2u8); + q6u16 = vmlal_u8(q6u16, d27u8, d2u8); + + q3u16 = vmlal_u8(q3u16, d27u8, d5u8); + q4u16 = vmlal_u8(q4u16, d28u8, d5u8); + q5u16 = vmlal_u8(q5u16, d29u8, d5u8); + q6u16 = vmlal_u8(q6u16, d30u8, d5u8); + + q7u16 = vmull_u8(d25u8, d3u8); + q8u16 = vmull_u8(d26u8, d3u8); + q9u16 = vmull_u8(d27u8, d3u8); + q10u16 = vmull_u8(d28u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + + q7s16 = vqaddq_s16(q7s16, q3s16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q9s16 = vqaddq_s16(q9s16, q5s16); + q10s16 = vqaddq_s16(q10s16, q6s16); + + d6u8 = vqrshrun_n_s16(q7s16, 7); + d7u8 = vqrshrun_n_s16(q8s16, 7); + d8u8 = vqrshrun_n_s16(q9s16, 7); + d9u8 = vqrshrun_n_s16(q10s16, 7); + + vst1_u8(dst_ptr, d6u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d7u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d8u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d9u8); +} + +void vp8_sixtap_predict8x8_neon(unsigned char *src_ptr, int src_pixels_per_line, + int xoffset, int yoffset, + unsigned char *dst_ptr, int dst_pitch) { + unsigned char *src, *tmpp; + unsigned char tmp[64]; + int i; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8; + uint8x8_t d18u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8, d25u8; + uint8x8_t d26u8, d27u8, d28u8, d29u8, d30u8, d31u8; + int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8; + uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16; + uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16; + int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16; + uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q9u8, q10u8, q11u8, q12u8; + + if (xoffset == 0) { // secondpass_filter8x8_only + // load second_pass filter + dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = 
vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + // load src data + src = src_ptr - src_pixels_per_line * 2; + d18u8 = vld1_u8(src); + src += src_pixels_per_line; + d19u8 = vld1_u8(src); + src += src_pixels_per_line; + d20u8 = vld1_u8(src); + src += src_pixels_per_line; + d21u8 = vld1_u8(src); + src += src_pixels_per_line; + d22u8 = vld1_u8(src); + src += src_pixels_per_line; + d23u8 = vld1_u8(src); + src += src_pixels_per_line; + d24u8 = vld1_u8(src); + src += src_pixels_per_line; + d25u8 = vld1_u8(src); + src += src_pixels_per_line; + d26u8 = vld1_u8(src); + src += src_pixels_per_line; + d27u8 = vld1_u8(src); + src += src_pixels_per_line; + d28u8 = vld1_u8(src); + src += src_pixels_per_line; + d29u8 = vld1_u8(src); + src += src_pixels_per_line; + d30u8 = vld1_u8(src); + + for (i = 2; i > 0; i--) { + q3u16 = vmull_u8(d18u8, d0u8); + q4u16 = vmull_u8(d19u8, d0u8); + q5u16 = vmull_u8(d20u8, d0u8); + q6u16 = vmull_u8(d21u8, d0u8); + + q3u16 = vmlsl_u8(q3u16, d19u8, d1u8); + q4u16 = vmlsl_u8(q4u16, d20u8, d1u8); + q5u16 = vmlsl_u8(q5u16, d21u8, d1u8); + q6u16 = vmlsl_u8(q6u16, d22u8, d1u8); + + q3u16 = vmlsl_u8(q3u16, d22u8, d4u8); + q4u16 = vmlsl_u8(q4u16, d23u8, d4u8); + q5u16 = vmlsl_u8(q5u16, d24u8, d4u8); + q6u16 = vmlsl_u8(q6u16, d25u8, d4u8); + + q3u16 = vmlal_u8(q3u16, d20u8, d2u8); + q4u16 = vmlal_u8(q4u16, d21u8, d2u8); + q5u16 = vmlal_u8(q5u16, d22u8, d2u8); + q6u16 = vmlal_u8(q6u16, d23u8, d2u8); + + q3u16 = vmlal_u8(q3u16, d23u8, d5u8); + q4u16 = vmlal_u8(q4u16, d24u8, d5u8); + q5u16 = vmlal_u8(q5u16, d25u8, d5u8); + q6u16 = vmlal_u8(q6u16, d26u8, d5u8); + + q7u16 = vmull_u8(d21u8, d3u8); + q8u16 = vmull_u8(d22u8, d3u8); + q9u16 = vmull_u8(d23u8, d3u8); + q10u16 = vmull_u8(d24u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + + q7s16 = vqaddq_s16(q7s16, q3s16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q9s16 = vqaddq_s16(q9s16, q5s16); + q10s16 = vqaddq_s16(q10s16, q6s16); + + d6u8 = vqrshrun_n_s16(q7s16, 7); + d7u8 = vqrshrun_n_s16(q8s16, 7); + d8u8 = vqrshrun_n_s16(q9s16, 7); + d9u8 = vqrshrun_n_s16(q10s16, 7); + + d18u8 = d22u8; + d19u8 = d23u8; + d20u8 = d24u8; + d21u8 = d25u8; + d22u8 = d26u8; + d23u8 = d27u8; + d24u8 = d28u8; + d25u8 = d29u8; + d26u8 = d30u8; + + vst1_u8(dst_ptr, d6u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d7u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d8u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d9u8); + dst_ptr += dst_pitch; + } + return; + } + + // load first_pass filter + dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + // First pass: output_height lines x output_width columns (13x8) + if 
+ if (yoffset == 0) // firstpass_filter8x8_only + src = src_ptr - 2; + else + src = src_ptr - 2 - (src_pixels_per_line * 2); + + tmpp = tmp; + for (i = 2; i > 0; i--) { + q3u8 = vld1q_u8(src); + src += src_pixels_per_line; + q4u8 = vld1q_u8(src); + src += src_pixels_per_line; + q5u8 = vld1q_u8(src); + src += src_pixels_per_line; + q6u8 = vld1q_u8(src); + src += src_pixels_per_line; + + __builtin_prefetch(src); + __builtin_prefetch(src + src_pixels_per_line); + __builtin_prefetch(src + src_pixels_per_line * 2); + + q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8); + q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8); + q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8); + q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1); + + q7u16 = vmlsl_u8(q7u16, d28u8, d1u8); + q8u16 = vmlsl_u8(q8u16, d29u8, d1u8); + q9u16 = vmlsl_u8(q9u16, d30u8, d1u8); + q10u16 = vmlsl_u8(q10u16, d31u8, d1u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4); + + q7u16 = vmlsl_u8(q7u16, d28u8, d4u8); + q8u16 = vmlsl_u8(q8u16, d29u8, d4u8); + q9u16 = vmlsl_u8(q9u16, d30u8, d4u8); + q10u16 = vmlsl_u8(q10u16, d31u8, d4u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2); + + q7u16 = vmlal_u8(q7u16, d28u8, d2u8); + q8u16 = vmlal_u8(q8u16, d29u8, d2u8); + q9u16 = vmlal_u8(q9u16, d30u8, d2u8); + q10u16 = vmlal_u8(q10u16, d31u8, d2u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5); + + q7u16 = vmlal_u8(q7u16, d28u8, d5u8); + q8u16 = vmlal_u8(q8u16, d29u8, d5u8); + q9u16 = vmlal_u8(q9u16, d30u8, d5u8); + q10u16 = vmlal_u8(q10u16, d31u8, d5u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3); + + q3u16 = vmull_u8(d28u8, d3u8); + q4u16 = vmull_u8(d29u8, d3u8); + q5u16 = vmull_u8(d30u8, d3u8); + q6u16 = vmull_u8(d31u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + + q7s16 = vqaddq_s16(q7s16, q3s16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q9s16 = vqaddq_s16(q9s16, q5s16); + q10s16 = vqaddq_s16(q10s16, q6s16); + + d22u8 = vqrshrun_n_s16(q7s16, 7); + d23u8 = vqrshrun_n_s16(q8s16, 7); + d24u8 = vqrshrun_n_s16(q9s16, 7); + d25u8 = vqrshrun_n_s16(q10s16, 7); + + if (yoffset == 0) { // firstpass_filter8x8_only + vst1_u8(dst_ptr, d22u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d23u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d24u8); + dst_ptr += dst_pitch; +
vst1_u8(dst_ptr, d25u8); + dst_ptr += dst_pitch; + } else { + vst1_u8(tmpp, d22u8); + tmpp += 8; + vst1_u8(tmpp, d23u8); + tmpp += 8; + vst1_u8(tmpp, d24u8); + tmpp += 8; + vst1_u8(tmpp, d25u8); + tmpp += 8; + } + } + if (yoffset == 0) return; + + // First pass on the remaining 5 rows + q3u8 = vld1q_u8(src); + src += src_pixels_per_line; + q4u8 = vld1q_u8(src); + src += src_pixels_per_line; + q5u8 = vld1q_u8(src); + src += src_pixels_per_line; + q6u8 = vld1q_u8(src); + src += src_pixels_per_line; + q7u8 = vld1q_u8(src); + + q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8); + q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8); + q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8); + q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8); + q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1); + + q8u16 = vmlsl_u8(q8u16, d27u8, d1u8); + q9u16 = vmlsl_u8(q9u16, d28u8, d1u8); + q10u16 = vmlsl_u8(q10u16, d29u8, d1u8); + q11u16 = vmlsl_u8(q11u16, d30u8, d1u8); + q12u16 = vmlsl_u8(q12u16, d31u8, d1u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4); + + q8u16 = vmlsl_u8(q8u16, d27u8, d4u8); + q9u16 = vmlsl_u8(q9u16, d28u8, d4u8); + q10u16 = vmlsl_u8(q10u16, d29u8, d4u8); + q11u16 = vmlsl_u8(q11u16, d30u8, d4u8); + q12u16 = vmlsl_u8(q12u16, d31u8, d4u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2); + + q8u16 = vmlal_u8(q8u16, d27u8, d2u8); + q9u16 = vmlal_u8(q9u16, d28u8, d2u8); + q10u16 = vmlal_u8(q10u16, d29u8, d2u8); + q11u16 = vmlal_u8(q11u16, d30u8, d2u8); + q12u16 = vmlal_u8(q12u16, d31u8, d2u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5); + + q8u16 = vmlal_u8(q8u16, d27u8, d5u8); + q9u16 = vmlal_u8(q9u16, d28u8, d5u8); + q10u16 = vmlal_u8(q10u16, d29u8, d5u8); + q11u16 = vmlal_u8(q11u16, d30u8, d5u8); + q12u16 = vmlal_u8(q12u16, d31u8, d5u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3); + + q3u16 = vmull_u8(d27u8, d3u8); + q4u16 = vmull_u8(d28u8, d3u8); + q5u16 = vmull_u8(d29u8, d3u8); + q6u16 = vmull_u8(d30u8, d3u8); + q7u16 = vmull_u8(d31u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = 
vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + q11s16 = vreinterpretq_s16_u16(q11u16); + q12s16 = vreinterpretq_s16_u16(q12u16); + + q8s16 = vqaddq_s16(q8s16, q3s16); + q9s16 = vqaddq_s16(q9s16, q4s16); + q10s16 = vqaddq_s16(q10s16, q5s16); + q11s16 = vqaddq_s16(q11s16, q6s16); + q12s16 = vqaddq_s16(q12s16, q7s16); + + d26u8 = vqrshrun_n_s16(q8s16, 7); + d27u8 = vqrshrun_n_s16(q9s16, 7); + d28u8 = vqrshrun_n_s16(q10s16, 7); + d29u8 = vqrshrun_n_s16(q11s16, 7); + d30u8 = vqrshrun_n_s16(q12s16, 7); + + // Second pass: 8x8 + dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + tmpp = tmp; + q9u8 = vld1q_u8(tmpp); + tmpp += 16; + q10u8 = vld1q_u8(tmpp); + tmpp += 16; + q11u8 = vld1q_u8(tmpp); + tmpp += 16; + q12u8 = vld1q_u8(tmpp); + + d18u8 = vget_low_u8(q9u8); + d19u8 = vget_high_u8(q9u8); + d20u8 = vget_low_u8(q10u8); + d21u8 = vget_high_u8(q10u8); + d22u8 = vget_low_u8(q11u8); + d23u8 = vget_high_u8(q11u8); + d24u8 = vget_low_u8(q12u8); + d25u8 = vget_high_u8(q12u8); + + for (i = 2; i > 0; i--) { + q3u16 = vmull_u8(d18u8, d0u8); + q4u16 = vmull_u8(d19u8, d0u8); + q5u16 = vmull_u8(d20u8, d0u8); + q6u16 = vmull_u8(d21u8, d0u8); + + q3u16 = vmlsl_u8(q3u16, d19u8, d1u8); + q4u16 = vmlsl_u8(q4u16, d20u8, d1u8); + q5u16 = vmlsl_u8(q5u16, d21u8, d1u8); + q6u16 = vmlsl_u8(q6u16, d22u8, d1u8); + + q3u16 = vmlsl_u8(q3u16, d22u8, d4u8); + q4u16 = vmlsl_u8(q4u16, d23u8, d4u8); + q5u16 = vmlsl_u8(q5u16, d24u8, d4u8); + q6u16 = vmlsl_u8(q6u16, d25u8, d4u8); + + q3u16 = vmlal_u8(q3u16, d20u8, d2u8); + q4u16 = vmlal_u8(q4u16, d21u8, d2u8); + q5u16 = vmlal_u8(q5u16, d22u8, d2u8); + q6u16 = vmlal_u8(q6u16, d23u8, d2u8); + + q3u16 = vmlal_u8(q3u16, d23u8, d5u8); + q4u16 = vmlal_u8(q4u16, d24u8, d5u8); + q5u16 = vmlal_u8(q5u16, d25u8, d5u8); + q6u16 = vmlal_u8(q6u16, d26u8, d5u8); + + q7u16 = vmull_u8(d21u8, d3u8); + q8u16 = vmull_u8(d22u8, d3u8); + q9u16 = vmull_u8(d23u8, d3u8); + q10u16 = vmull_u8(d24u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + + q7s16 = vqaddq_s16(q7s16, q3s16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q9s16 = vqaddq_s16(q9s16, q5s16); + q10s16 = vqaddq_s16(q10s16, q6s16); + + d6u8 = vqrshrun_n_s16(q7s16, 7); + d7u8 = vqrshrun_n_s16(q8s16, 7); + d8u8 = vqrshrun_n_s16(q9s16, 7); + d9u8 = vqrshrun_n_s16(q10s16, 7); + + d18u8 = d22u8; + d19u8 = d23u8; + d20u8 = d24u8; + d21u8 = d25u8; + d22u8 = d26u8; + d23u8 = d27u8; + d24u8 = d28u8; + d25u8 = d29u8; + d26u8 = d30u8; + + vst1_u8(dst_ptr, d6u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d7u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d8u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d9u8); + dst_ptr += dst_pitch; + } +} + +void vp8_sixtap_predict16x16_neon(unsigned char *src_ptr, + int src_pixels_per_line, int xoffset, + int yoffset, unsigned char *dst_ptr, + int 
dst_pitch) { + unsigned char *src, *src_tmp, *dst, *tmpp; + unsigned char tmp[336]; // 21 rows x 16 columns of first-pass output + int i, j; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8; + uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d18u8, d19u8; + uint8x8_t d20u8, d21u8, d22u8, d23u8, d24u8, d25u8, d26u8, d27u8; + uint8x8_t d28u8, d29u8, d30u8, d31u8; + int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8; + uint8x16_t q3u8, q4u8; + uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16, q8u16, q9u16, q10u16; + uint16x8_t q11u16, q12u16, q13u16, q15u16; + int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16, q8s16, q9s16, q10s16; + int16x8_t q11s16, q12s16, q13s16, q15s16; + + if (xoffset == 0) { // secondpass_filter16x16_only + // load second_pass filter + dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + // load src data + src_tmp = src_ptr - src_pixels_per_line * 2; + for (i = 0; i < 2; ++i) { + src = src_tmp + i * 8; + dst = dst_ptr + i * 8; + d18u8 = vld1_u8(src); + src += src_pixels_per_line; + d19u8 = vld1_u8(src); + src += src_pixels_per_line; + d20u8 = vld1_u8(src); + src += src_pixels_per_line; + d21u8 = vld1_u8(src); + src += src_pixels_per_line; + d22u8 = vld1_u8(src); + src += src_pixels_per_line; + for (j = 0; j < 4; ++j) { + d23u8 = vld1_u8(src); + src += src_pixels_per_line; + d24u8 = vld1_u8(src); + src += src_pixels_per_line; + d25u8 = vld1_u8(src); + src += src_pixels_per_line; + d26u8 = vld1_u8(src); + src += src_pixels_per_line; + + q3u16 = vmull_u8(d18u8, d0u8); + q4u16 = vmull_u8(d19u8, d0u8); + q5u16 = vmull_u8(d20u8, d0u8); + q6u16 = vmull_u8(d21u8, d0u8); + + q3u16 = vmlsl_u8(q3u16, d19u8, d1u8); + q4u16 = vmlsl_u8(q4u16, d20u8, d1u8); + q5u16 = vmlsl_u8(q5u16, d21u8, d1u8); + q6u16 = vmlsl_u8(q6u16, d22u8, d1u8); + + q3u16 = vmlsl_u8(q3u16, d22u8, d4u8); + q4u16 = vmlsl_u8(q4u16, d23u8, d4u8); + q5u16 = vmlsl_u8(q5u16, d24u8, d4u8); + q6u16 = vmlsl_u8(q6u16, d25u8, d4u8); + + q3u16 = vmlal_u8(q3u16, d20u8, d2u8); + q4u16 = vmlal_u8(q4u16, d21u8, d2u8); + q5u16 = vmlal_u8(q5u16, d22u8, d2u8); + q6u16 = vmlal_u8(q6u16, d23u8, d2u8); + + q3u16 = vmlal_u8(q3u16, d23u8, d5u8); + q4u16 = vmlal_u8(q4u16, d24u8, d5u8); + q5u16 = vmlal_u8(q5u16, d25u8, d5u8); + q6u16 = vmlal_u8(q6u16, d26u8, d5u8); + + q7u16 = vmull_u8(d21u8, d3u8); + q8u16 = vmull_u8(d22u8, d3u8); + q9u16 = vmull_u8(d23u8, d3u8); + q10u16 = vmull_u8(d24u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + + q7s16 = vqaddq_s16(q7s16, q3s16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q9s16 = vqaddq_s16(q9s16, q5s16); + q10s16 = vqaddq_s16(q10s16, q6s16); + + d6u8 = vqrshrun_n_s16(q7s16, 7); + d7u8 = vqrshrun_n_s16(q8s16, 7); + d8u8 = vqrshrun_n_s16(q9s16, 7); + d9u8 = vqrshrun_n_s16(q10s16, 7); + + d18u8 = d22u8; + d19u8 = d23u8; + d20u8 = d24u8; + d21u8 = d25u8; + d22u8 = d26u8; + + vst1_u8(dst, d6u8);
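+ // vqrshrun_n_s16(x, 7) above is the VP8 normalization step: the six + // filter taps sum to 128, so clamp((x + 64) >> 7, 0, 255) rescales the + // saturated accumulator back to an 8-bit pixel in a single instruction.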
+ dst += dst_pitch; + vst1_u8(dst, d7u8); + dst += dst_pitch; + vst1_u8(dst, d8u8); + dst += dst_pitch; + vst1_u8(dst, d9u8); + dst += dst_pitch; + } + } + return; + } + + // load first_pass filter + dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + // First pass: output_height lines x output_width columns (21x16, or 16x16 when yoffset == 0) + if (yoffset == 0) { // firstpass_filter16x16_only + src = src_ptr - 2; + dst = dst_ptr; + for (i = 0; i < 8; ++i) { + d6u8 = vld1_u8(src); + d7u8 = vld1_u8(src + 8); + d8u8 = vld1_u8(src + 16); + src += src_pixels_per_line; + d9u8 = vld1_u8(src); + d10u8 = vld1_u8(src + 8); + d11u8 = vld1_u8(src + 16); + src += src_pixels_per_line; + + __builtin_prefetch(src); + __builtin_prefetch(src + src_pixels_per_line); + + q6u16 = vmull_u8(d6u8, d0u8); + q7u16 = vmull_u8(d7u8, d0u8); + q8u16 = vmull_u8(d9u8, d0u8); + q9u16 = vmull_u8(d10u8, d0u8); + + d20u8 = vext_u8(d6u8, d7u8, 1); + d21u8 = vext_u8(d9u8, d10u8, 1); + d22u8 = vext_u8(d7u8, d8u8, 1); + d23u8 = vext_u8(d10u8, d11u8, 1); + d24u8 = vext_u8(d6u8, d7u8, 4); + d25u8 = vext_u8(d9u8, d10u8, 4); + d26u8 = vext_u8(d7u8, d8u8, 4); + d27u8 = vext_u8(d10u8, d11u8, 4); + d28u8 = vext_u8(d6u8, d7u8, 5); + d29u8 = vext_u8(d9u8, d10u8, 5); + + q6u16 = vmlsl_u8(q6u16, d20u8, d1u8); + q8u16 = vmlsl_u8(q8u16, d21u8, d1u8); + q7u16 = vmlsl_u8(q7u16, d22u8, d1u8); + q9u16 = vmlsl_u8(q9u16, d23u8, d1u8); + q6u16 = vmlsl_u8(q6u16, d24u8, d4u8); + q8u16 = vmlsl_u8(q8u16, d25u8, d4u8); + q7u16 = vmlsl_u8(q7u16, d26u8, d4u8); + q9u16 = vmlsl_u8(q9u16, d27u8, d4u8); + q6u16 = vmlal_u8(q6u16, d28u8, d5u8); + q8u16 = vmlal_u8(q8u16, d29u8, d5u8); + + d20u8 = vext_u8(d7u8, d8u8, 5); + d21u8 = vext_u8(d10u8, d11u8, 5); + d22u8 = vext_u8(d6u8, d7u8, 2); + d23u8 = vext_u8(d9u8, d10u8, 2); + d24u8 = vext_u8(d7u8, d8u8, 2); + d25u8 = vext_u8(d10u8, d11u8, 2); + d26u8 = vext_u8(d6u8, d7u8, 3); + d27u8 = vext_u8(d9u8, d10u8, 3); + d28u8 = vext_u8(d7u8, d8u8, 3); + d29u8 = vext_u8(d10u8, d11u8, 3); + + q7u16 = vmlal_u8(q7u16, d20u8, d5u8); + q9u16 = vmlal_u8(q9u16, d21u8, d5u8); + q6u16 = vmlal_u8(q6u16, d22u8, d2u8); + q8u16 = vmlal_u8(q8u16, d23u8, d2u8); + q7u16 = vmlal_u8(q7u16, d24u8, d2u8); + q9u16 = vmlal_u8(q9u16, d25u8, d2u8); + + q10u16 = vmull_u8(d26u8, d3u8); + q11u16 = vmull_u8(d27u8, d3u8); + q12u16 = vmull_u8(d28u8, d3u8); + q15u16 = vmull_u8(d29u8, d3u8); + + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + q11s16 = vreinterpretq_s16_u16(q11u16); + q12s16 = vreinterpretq_s16_u16(q12u16); + q15s16 = vreinterpretq_s16_u16(q15u16); + + q6s16 = vqaddq_s16(q6s16, q10s16); + q8s16 = vqaddq_s16(q8s16, q11s16); + q7s16 = vqaddq_s16(q7s16, q12s16); + q9s16 = vqaddq_s16(q9s16, q15s16); + + d6u8 = vqrshrun_n_s16(q6s16, 7); + d7u8 = vqrshrun_n_s16(q7s16, 7); + d8u8 = vqrshrun_n_s16(q8s16, 7); + d9u8 = vqrshrun_n_s16(q9s16, 7); + + q3u8 = vcombine_u8(d6u8, d7u8); + q4u8 = vcombine_u8(d8u8, d9u8); + vst1q_u8(dst, q3u8); + dst += dst_pitch; + 
vst1q_u8(dst, q4u8); + dst += dst_pitch; + } + return; + } + + src = src_ptr - 2 - src_pixels_per_line * 2; + tmpp = tmp; + for (i = 0; i < 7; ++i) { + d6u8 = vld1_u8(src); + d7u8 = vld1_u8(src + 8); + d8u8 = vld1_u8(src + 16); + src += src_pixels_per_line; + d9u8 = vld1_u8(src); + d10u8 = vld1_u8(src + 8); + d11u8 = vld1_u8(src + 16); + src += src_pixels_per_line; + d12u8 = vld1_u8(src); + d13u8 = vld1_u8(src + 8); + // Only 5 pixels are needed, avoid a potential out of bounds read. + d14u8 = vld1_u8(src + 13); + d14u8 = vext_u8(d14u8, d14u8, 3); + src += src_pixels_per_line; + + __builtin_prefetch(src); + __builtin_prefetch(src + src_pixels_per_line); + __builtin_prefetch(src + src_pixels_per_line * 2); + + q8u16 = vmull_u8(d6u8, d0u8); + q9u16 = vmull_u8(d7u8, d0u8); + q10u16 = vmull_u8(d9u8, d0u8); + q11u16 = vmull_u8(d10u8, d0u8); + q12u16 = vmull_u8(d12u8, d0u8); + q13u16 = vmull_u8(d13u8, d0u8); + + d28u8 = vext_u8(d6u8, d7u8, 1); + d29u8 = vext_u8(d9u8, d10u8, 1); + d30u8 = vext_u8(d12u8, d13u8, 1); + q8u16 = vmlsl_u8(q8u16, d28u8, d1u8); + q10u16 = vmlsl_u8(q10u16, d29u8, d1u8); + q12u16 = vmlsl_u8(q12u16, d30u8, d1u8); + d28u8 = vext_u8(d7u8, d8u8, 1); + d29u8 = vext_u8(d10u8, d11u8, 1); + d30u8 = vext_u8(d13u8, d14u8, 1); + q9u16 = vmlsl_u8(q9u16, d28u8, d1u8); + q11u16 = vmlsl_u8(q11u16, d29u8, d1u8); + q13u16 = vmlsl_u8(q13u16, d30u8, d1u8); + + d28u8 = vext_u8(d6u8, d7u8, 4); + d29u8 = vext_u8(d9u8, d10u8, 4); + d30u8 = vext_u8(d12u8, d13u8, 4); + q8u16 = vmlsl_u8(q8u16, d28u8, d4u8); + q10u16 = vmlsl_u8(q10u16, d29u8, d4u8); + q12u16 = vmlsl_u8(q12u16, d30u8, d4u8); + d28u8 = vext_u8(d7u8, d8u8, 4); + d29u8 = vext_u8(d10u8, d11u8, 4); + d30u8 = vext_u8(d13u8, d14u8, 4); + q9u16 = vmlsl_u8(q9u16, d28u8, d4u8); + q11u16 = vmlsl_u8(q11u16, d29u8, d4u8); + q13u16 = vmlsl_u8(q13u16, d30u8, d4u8); + + d28u8 = vext_u8(d6u8, d7u8, 5); + d29u8 = vext_u8(d9u8, d10u8, 5); + d30u8 = vext_u8(d12u8, d13u8, 5); + q8u16 = vmlal_u8(q8u16, d28u8, d5u8); + q10u16 = vmlal_u8(q10u16, d29u8, d5u8); + q12u16 = vmlal_u8(q12u16, d30u8, d5u8); + d28u8 = vext_u8(d7u8, d8u8, 5); + d29u8 = vext_u8(d10u8, d11u8, 5); + d30u8 = vext_u8(d13u8, d14u8, 5); + q9u16 = vmlal_u8(q9u16, d28u8, d5u8); + q11u16 = vmlal_u8(q11u16, d29u8, d5u8); + q13u16 = vmlal_u8(q13u16, d30u8, d5u8); + + d28u8 = vext_u8(d6u8, d7u8, 2); + d29u8 = vext_u8(d9u8, d10u8, 2); + d30u8 = vext_u8(d12u8, d13u8, 2); + q8u16 = vmlal_u8(q8u16, d28u8, d2u8); + q10u16 = vmlal_u8(q10u16, d29u8, d2u8); + q12u16 = vmlal_u8(q12u16, d30u8, d2u8); + d28u8 = vext_u8(d7u8, d8u8, 2); + d29u8 = vext_u8(d10u8, d11u8, 2); + d30u8 = vext_u8(d13u8, d14u8, 2); + q9u16 = vmlal_u8(q9u16, d28u8, d2u8); + q11u16 = vmlal_u8(q11u16, d29u8, d2u8); + q13u16 = vmlal_u8(q13u16, d30u8, d2u8); + + d28u8 = vext_u8(d6u8, d7u8, 3); + d29u8 = vext_u8(d9u8, d10u8, 3); + d30u8 = vext_u8(d12u8, d13u8, 3); + d15u8 = vext_u8(d7u8, d8u8, 3); + d31u8 = vext_u8(d10u8, d11u8, 3); + d6u8 = vext_u8(d13u8, d14u8, 3); + q4u16 = vmull_u8(d28u8, d3u8); + q5u16 = vmull_u8(d29u8, d3u8); + q6u16 = vmull_u8(d30u8, d3u8); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + q12s16 = vreinterpretq_s16_u16(q12u16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q10s16 = vqaddq_s16(q10s16, q5s16); + q12s16 = vqaddq_s16(q12s16, q6s16); + + q6u16 = vmull_u8(d15u8, d3u8); + q7u16 = vmull_u8(d31u8, d3u8); + q3u16 = vmull_u8(d6u8, d3u8); + q3s16 = 
vreinterpretq_s16_u16(q3u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q11s16 = vreinterpretq_s16_u16(q11u16); + q13s16 = vreinterpretq_s16_u16(q13u16); + q9s16 = vqaddq_s16(q9s16, q6s16); + q11s16 = vqaddq_s16(q11s16, q7s16); + q13s16 = vqaddq_s16(q13s16, q3s16); + + d6u8 = vqrshrun_n_s16(q8s16, 7); + d7u8 = vqrshrun_n_s16(q9s16, 7); + d8u8 = vqrshrun_n_s16(q10s16, 7); + d9u8 = vqrshrun_n_s16(q11s16, 7); + d10u8 = vqrshrun_n_s16(q12s16, 7); + d11u8 = vqrshrun_n_s16(q13s16, 7); + + vst1_u8(tmpp, d6u8); + tmpp += 8; + vst1_u8(tmpp, d7u8); + tmpp += 8; + vst1_u8(tmpp, d8u8); + tmpp += 8; + vst1_u8(tmpp, d9u8); + tmpp += 8; + vst1_u8(tmpp, d10u8); + tmpp += 8; + vst1_u8(tmpp, d11u8); + tmpp += 8; + } + + // Second pass: 16x16 + dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + for (i = 0; i < 2; ++i) { + dst = dst_ptr + 8 * i; + tmpp = tmp + 8 * i; + d18u8 = vld1_u8(tmpp); + tmpp += 16; + d19u8 = vld1_u8(tmpp); + tmpp += 16; + d20u8 = vld1_u8(tmpp); + tmpp += 16; + d21u8 = vld1_u8(tmpp); + tmpp += 16; + d22u8 = vld1_u8(tmpp); + tmpp += 16; + for (j = 0; j < 4; ++j) { + d23u8 = vld1_u8(tmpp); + tmpp += 16; + d24u8 = vld1_u8(tmpp); + tmpp += 16; + d25u8 = vld1_u8(tmpp); + tmpp += 16; + d26u8 = vld1_u8(tmpp); + tmpp += 16; + + q3u16 = vmull_u8(d18u8, d0u8); + q4u16 = vmull_u8(d19u8, d0u8); + q5u16 = vmull_u8(d20u8, d0u8); + q6u16 = vmull_u8(d21u8, d0u8); + + q3u16 = vmlsl_u8(q3u16, d19u8, d1u8); + q4u16 = vmlsl_u8(q4u16, d20u8, d1u8); + q5u16 = vmlsl_u8(q5u16, d21u8, d1u8); + q6u16 = vmlsl_u8(q6u16, d22u8, d1u8); + + q3u16 = vmlsl_u8(q3u16, d22u8, d4u8); + q4u16 = vmlsl_u8(q4u16, d23u8, d4u8); + q5u16 = vmlsl_u8(q5u16, d24u8, d4u8); + q6u16 = vmlsl_u8(q6u16, d25u8, d4u8); + + q3u16 = vmlal_u8(q3u16, d20u8, d2u8); + q4u16 = vmlal_u8(q4u16, d21u8, d2u8); + q5u16 = vmlal_u8(q5u16, d22u8, d2u8); + q6u16 = vmlal_u8(q6u16, d23u8, d2u8); + + q3u16 = vmlal_u8(q3u16, d23u8, d5u8); + q4u16 = vmlal_u8(q4u16, d24u8, d5u8); + q5u16 = vmlal_u8(q5u16, d25u8, d5u8); + q6u16 = vmlal_u8(q6u16, d26u8, d5u8); + + q7u16 = vmull_u8(d21u8, d3u8); + q8u16 = vmull_u8(d22u8, d3u8); + q9u16 = vmull_u8(d23u8, d3u8); + q10u16 = vmull_u8(d24u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + + q7s16 = vqaddq_s16(q7s16, q3s16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q9s16 = vqaddq_s16(q9s16, q5s16); + q10s16 = vqaddq_s16(q10s16, q6s16); + + d6u8 = vqrshrun_n_s16(q7s16, 7); + d7u8 = vqrshrun_n_s16(q8s16, 7); + d8u8 = vqrshrun_n_s16(q9s16, 7); + d9u8 = vqrshrun_n_s16(q10s16, 7); + + d18u8 = d22u8; + d19u8 = d23u8; + d20u8 = d24u8; + d21u8 = d25u8; + d22u8 = d26u8; + + vst1_u8(dst, d6u8); + dst += dst_pitch; + vst1_u8(dst, d7u8); + dst += dst_pitch; + vst1_u8(dst, d8u8); + dst += dst_pitch; + vst1_u8(dst, 
d9u8); + dst += dst_pitch; + } + } +} diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/vp8_loopfilter_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/vp8_loopfilter_neon.c new file mode 100644 index 0000000000..ebc004a048 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/arm/neon/vp8_loopfilter_neon.c @@ -0,0 +1,538 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vpx_config.h" +#include "vp8/common/arm/loopfilter_arm.h" +#include "vpx_ports/arm.h" + +static INLINE void vp8_loop_filter_neon(uint8x16_t qblimit, // flimit + uint8x16_t qlimit, // limit + uint8x16_t qthresh, // thresh + uint8x16_t q3, // p3 + uint8x16_t q4, // p2 + uint8x16_t q5, // p1 + uint8x16_t q6, // p0 + uint8x16_t q7, // q0 + uint8x16_t q8, // q1 + uint8x16_t q9, // q2 + uint8x16_t q10, // q3 + uint8x16_t *q5r, // p1 + uint8x16_t *q6r, // p0 + uint8x16_t *q7r, // q0 + uint8x16_t *q8r) { // q1 + uint8x16_t q0u8, q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8; + int16x8_t q2s16, q11s16; + uint16x8_t q4u16; + int8x16_t q1s8, q2s8, q10s8, q11s8, q12s8, q13s8; + int8x8_t d2s8, d3s8; + + q11u8 = vabdq_u8(q3, q4); + q12u8 = vabdq_u8(q4, q5); + q13u8 = vabdq_u8(q5, q6); + q14u8 = vabdq_u8(q8, q7); + q3 = vabdq_u8(q9, q8); + q4 = vabdq_u8(q10, q9); + + q11u8 = vmaxq_u8(q11u8, q12u8); + q12u8 = vmaxq_u8(q13u8, q14u8); + q3 = vmaxq_u8(q3, q4); + q15u8 = vmaxq_u8(q11u8, q12u8); + + q9 = vabdq_u8(q6, q7); + + // vp8_hevmask + q13u8 = vcgtq_u8(q13u8, qthresh); + q14u8 = vcgtq_u8(q14u8, qthresh); + q15u8 = vmaxq_u8(q15u8, q3); + + q2u8 = vabdq_u8(q5, q8); + q9 = vqaddq_u8(q9, q9); + + q15u8 = vcgeq_u8(qlimit, q15u8); + + // vp8_filter() function + // convert to signed + q10 = vdupq_n_u8(0x80); + q8 = veorq_u8(q8, q10); + q7 = veorq_u8(q7, q10); + q6 = veorq_u8(q6, q10); + q5 = veorq_u8(q5, q10); + + q2u8 = vshrq_n_u8(q2u8, 1); + q9 = vqaddq_u8(q9, q2u8); + + q10 = vdupq_n_u8(3); + + q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)), + vget_low_s8(vreinterpretq_s8_u8(q6))); + q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)), + vget_high_s8(vreinterpretq_s8_u8(q6))); + + q9 = vcgeq_u8(qblimit, q9); + + q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5), vreinterpretq_s8_u8(q8)); + + q14u8 = vorrq_u8(q13u8, q14u8); + + q4u16 = vmovl_u8(vget_low_u8(q10)); + q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16)); + q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16)); + + q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8); + q15u8 = vandq_u8(q15u8, q9); + + q1s8 = vreinterpretq_s8_u8(q1u8); + q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8)); + q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8)); + + q9 = vdupq_n_u8(4); + // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0)) + d2s8 = vqmovn_s16(q2s16); + d3s8 = vqmovn_s16(q11s16); + q1s8 = vcombine_s8(d2s8, d3s8); + q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8); + q1s8 = vreinterpretq_s8_u8(q1u8); + + q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q10)); + q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9)); + q2s8 = vshrq_n_s8(q2s8, 3); + q1s8 = vshrq_n_s8(q1s8, 3); + + q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8); + q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8); + + q1s8 = vrshrq_n_s8(q1s8, 
1); + q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8)); + + q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8); + q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8); + + q0u8 = vdupq_n_u8(0x80); + *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q0u8); + *q7r = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8); + *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8); + *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q0u8); + return; +} + +void vp8_loop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh) { + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + src -= (pitch << 2); + + q3 = vld1q_u8(src); + src += pitch; + q4 = vld1q_u8(src); + src += pitch; + q5 = vld1q_u8(src); + src += pitch; + q6 = vld1q_u8(src); + src += pitch; + q7 = vld1q_u8(src); + src += pitch; + q8 = vld1q_u8(src); + src += pitch; + q9 = vld1q_u8(src); + src += pitch; + q10 = vld1q_u8(src); + + vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4, q5, q6, q7, q8, q9, + q10, &q5, &q6, &q7, &q8); + + src -= (pitch * 5); + vst1q_u8(src, q5); + src += pitch; + vst1q_u8(src, q6); + src += pitch; + vst1q_u8(src, q7); + src += pitch; + vst1q_u8(src, q8); + return; +} + +void vp8_loop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh, + unsigned char *v) { + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14; + uint8x8_t d15, d16, d17, d18, d19, d20, d21; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + + u -= (pitch << 2); + v -= (pitch << 2); + + d6 = vld1_u8(u); + u += pitch; + d7 = vld1_u8(v); + v += pitch; + d8 = vld1_u8(u); + u += pitch; + d9 = vld1_u8(v); + v += pitch; + d10 = vld1_u8(u); + u += pitch; + d11 = vld1_u8(v); + v += pitch; + d12 = vld1_u8(u); + u += pitch; + d13 = vld1_u8(v); + v += pitch; + d14 = vld1_u8(u); + u += pitch; + d15 = vld1_u8(v); + v += pitch; + d16 = vld1_u8(u); + u += pitch; + d17 = vld1_u8(v); + v += pitch; + d18 = vld1_u8(u); + u += pitch; + d19 = vld1_u8(v); + v += pitch; + d20 = vld1_u8(u); + d21 = vld1_u8(v); + + q3 = vcombine_u8(d6, d7); + q4 = vcombine_u8(d8, d9); + q5 = vcombine_u8(d10, d11); + q6 = vcombine_u8(d12, d13); + q7 = vcombine_u8(d14, d15); + q8 = vcombine_u8(d16, d17); + q9 = vcombine_u8(d18, d19); + q10 = vcombine_u8(d20, d21); + + vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4, q5, q6, q7, q8, q9, + q10, &q5, &q6, &q7, &q8); + + u -= (pitch * 5); + vst1_u8(u, vget_low_u8(q5)); + u += pitch; + vst1_u8(u, vget_low_u8(q6)); + u += pitch; + vst1_u8(u, vget_low_u8(q7)); + u += pitch; + vst1_u8(u, vget_low_u8(q8)); + + v -= (pitch * 5); + vst1_u8(v, vget_high_u8(q5)); + v += pitch; + vst1_u8(v, vget_high_u8(q6)); + v += pitch; + vst1_u8(v, vget_high_u8(q7)); + v += pitch; + vst1_u8(v, vget_high_u8(q8)); + return; +} + +static INLINE void write_4x8(unsigned char *dst, int pitch, + const uint8x8x4_t result) { +#ifdef VPX_INCOMPATIBLE_GCC + /* + * uint8x8x4_t result + 00 01 02 03 | 04 05 06 07 + 10 11 12 13 | 14 15 16 17 + 20 21 22 23 | 24 25 26 27 + 30 31 32 33 | 34 35 36 37 + --- + * after vtrn_u16 + 00 01 20 21 | 04 05 24 25 + 02 03 22 23 | 06 07 26 27 + 10 11 30 31 | 14 15 34 35 + 12 13 32 33 | 16 17 36 37 + --- + * after vtrn_u8 + 00 10 20 30 | 04 14 24 34 + 01 11 21 31 | 
05 15 25 35 + 02 12 22 32 | 06 16 26 36 + 03 13 23 33 | 07 17 27 37 + */ + const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[0]), + vreinterpret_u16_u8(result.val[2])); + const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[1]), + vreinterpret_u16_u8(result.val[3])); + const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]), + vreinterpret_u8_u16(r13_u16.val[0])); + const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]), + vreinterpret_u8_u16(r13_u16.val[1])); + const uint32x2_t x_0_4 = vreinterpret_u32_u8(r01_u8.val[0]); + const uint32x2_t x_1_5 = vreinterpret_u32_u8(r01_u8.val[1]); + const uint32x2_t x_2_6 = vreinterpret_u32_u8(r23_u8.val[0]); + const uint32x2_t x_3_7 = vreinterpret_u32_u8(r23_u8.val[1]); + vst1_lane_u32((uint32_t *)dst, x_0_4, 0); + dst += pitch; + vst1_lane_u32((uint32_t *)dst, x_1_5, 0); + dst += pitch; + vst1_lane_u32((uint32_t *)dst, x_2_6, 0); + dst += pitch; + vst1_lane_u32((uint32_t *)dst, x_3_7, 0); + dst += pitch; + vst1_lane_u32((uint32_t *)dst, x_0_4, 1); + dst += pitch; + vst1_lane_u32((uint32_t *)dst, x_1_5, 1); + dst += pitch; + vst1_lane_u32((uint32_t *)dst, x_2_6, 1); + dst += pitch; + vst1_lane_u32((uint32_t *)dst, x_3_7, 1); +#else + vst4_lane_u8(dst, result, 0); + dst += pitch; + vst4_lane_u8(dst, result, 1); + dst += pitch; + vst4_lane_u8(dst, result, 2); + dst += pitch; + vst4_lane_u8(dst, result, 3); + dst += pitch; + vst4_lane_u8(dst, result, 4); + dst += pitch; + vst4_lane_u8(dst, result, 5); + dst += pitch; + vst4_lane_u8(dst, result, 6); + dst += pitch; + vst4_lane_u8(dst, result, 7); +#endif // VPX_INCOMPATIBLE_GCC +} + +void vp8_loop_filter_vertical_edge_y_neon(unsigned char *src, int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh) { + unsigned char *s, *d; + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14; + uint8x8_t d15, d16, d17, d18, d19, d20, d21; + uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3; + uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7; + uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11; + uint8x8x4_t q4ResultH, q4ResultL; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + + s = src - 4; + d6 = vld1_u8(s); + s += pitch; + d8 = vld1_u8(s); + s += pitch; + d10 = vld1_u8(s); + s += pitch; + d12 = vld1_u8(s); + s += pitch; + d14 = vld1_u8(s); + s += pitch; + d16 = vld1_u8(s); + s += pitch; + d18 = vld1_u8(s); + s += pitch; + d20 = vld1_u8(s); + s += pitch; + d7 = vld1_u8(s); + s += pitch; + d9 = vld1_u8(s); + s += pitch; + d11 = vld1_u8(s); + s += pitch; + d13 = vld1_u8(s); + s += pitch; + d15 = vld1_u8(s); + s += pitch; + d17 = vld1_u8(s); + s += pitch; + d19 = vld1_u8(s); + s += pitch; + d21 = vld1_u8(s); + + q3 = vcombine_u8(d6, d7); + q4 = vcombine_u8(d8, d9); + q5 = vcombine_u8(d10, d11); + q6 = vcombine_u8(d12, d13); + q7 = vcombine_u8(d14, d15); + q8 = vcombine_u8(d16, d17); + q9 = vcombine_u8(d18, d19); + q10 = vcombine_u8(d20, d21); + + q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7)); + q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8)); + q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9)); + q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10)); + + q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]), + vreinterpretq_u16_u32(q2tmp2.val[0])); + q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]), + 
vreinterpretq_u16_u32(q2tmp3.val[0])); + q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]), + vreinterpretq_u16_u32(q2tmp2.val[1])); + q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]), + vreinterpretq_u16_u32(q2tmp3.val[1])); + + q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]), + vreinterpretq_u8_u16(q2tmp5.val[0])); + q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]), + vreinterpretq_u8_u16(q2tmp5.val[1])); + q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]), + vreinterpretq_u8_u16(q2tmp7.val[0])); + q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]), + vreinterpretq_u8_u16(q2tmp7.val[1])); + + q3 = q2tmp8.val[0]; + q4 = q2tmp8.val[1]; + q5 = q2tmp9.val[0]; + q6 = q2tmp9.val[1]; + q7 = q2tmp10.val[0]; + q8 = q2tmp10.val[1]; + q9 = q2tmp11.val[0]; + q10 = q2tmp11.val[1]; + + vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4, q5, q6, q7, q8, q9, + q10, &q5, &q6, &q7, &q8); + + q4ResultL.val[0] = vget_low_u8(q5); // d10 + q4ResultL.val[1] = vget_low_u8(q6); // d12 + q4ResultL.val[2] = vget_low_u8(q7); // d14 + q4ResultL.val[3] = vget_low_u8(q8); // d16 + q4ResultH.val[0] = vget_high_u8(q5); // d11 + q4ResultH.val[1] = vget_high_u8(q6); // d13 + q4ResultH.val[2] = vget_high_u8(q7); // d15 + q4ResultH.val[3] = vget_high_u8(q8); // d17 + + d = src - 2; + write_4x8(d, pitch, q4ResultL); + d += pitch * 8; + write_4x8(d, pitch, q4ResultH); +} + +void vp8_loop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh, + unsigned char *v) { + unsigned char *us, *ud; + unsigned char *vs, *vd; + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14; + uint8x8_t d15, d16, d17, d18, d19, d20, d21; + uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3; + uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7; + uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11; + uint8x8x4_t q4ResultH, q4ResultL; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + + us = u - 4; + d6 = vld1_u8(us); + us += pitch; + d8 = vld1_u8(us); + us += pitch; + d10 = vld1_u8(us); + us += pitch; + d12 = vld1_u8(us); + us += pitch; + d14 = vld1_u8(us); + us += pitch; + d16 = vld1_u8(us); + us += pitch; + d18 = vld1_u8(us); + us += pitch; + d20 = vld1_u8(us); + + vs = v - 4; + d7 = vld1_u8(vs); + vs += pitch; + d9 = vld1_u8(vs); + vs += pitch; + d11 = vld1_u8(vs); + vs += pitch; + d13 = vld1_u8(vs); + vs += pitch; + d15 = vld1_u8(vs); + vs += pitch; + d17 = vld1_u8(vs); + vs += pitch; + d19 = vld1_u8(vs); + vs += pitch; + d21 = vld1_u8(vs); + + q3 = vcombine_u8(d6, d7); + q4 = vcombine_u8(d8, d9); + q5 = vcombine_u8(d10, d11); + q6 = vcombine_u8(d12, d13); + q7 = vcombine_u8(d14, d15); + q8 = vcombine_u8(d16, d17); + q9 = vcombine_u8(d18, d19); + q10 = vcombine_u8(d20, d21); + + q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7)); + q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8)); + q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9)); + q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10)); + + q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]), + vreinterpretq_u16_u32(q2tmp2.val[0])); + q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]), + vreinterpretq_u16_u32(q2tmp3.val[0])); + q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]), + vreinterpretq_u16_u32(q2tmp2.val[1])); + q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]), + 
vreinterpretq_u16_u32(q2tmp3.val[1])); + + q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]), + vreinterpretq_u8_u16(q2tmp5.val[0])); + q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]), + vreinterpretq_u8_u16(q2tmp5.val[1])); + q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]), + vreinterpretq_u8_u16(q2tmp7.val[0])); + q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]), + vreinterpretq_u8_u16(q2tmp7.val[1])); + + q3 = q2tmp8.val[0]; + q4 = q2tmp8.val[1]; + q5 = q2tmp9.val[0]; + q6 = q2tmp9.val[1]; + q7 = q2tmp10.val[0]; + q8 = q2tmp10.val[1]; + q9 = q2tmp11.val[0]; + q10 = q2tmp11.val[1]; + + vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4, q5, q6, q7, q8, q9, + q10, &q5, &q6, &q7, &q8); + + q4ResultL.val[0] = vget_low_u8(q5); // d10 + q4ResultL.val[1] = vget_low_u8(q6); // d12 + q4ResultL.val[2] = vget_low_u8(q7); // d14 + q4ResultL.val[3] = vget_low_u8(q8); // d16 + ud = u - 2; + write_4x8(ud, pitch, q4ResultL); + + q4ResultH.val[0] = vget_high_u8(q5); // d11 + q4ResultH.val[1] = vget_high_u8(q6); // d13 + q4ResultH.val[2] = vget_high_u8(q7); // d15 + q4ResultH.val[3] = vget_high_u8(q8); // d17 + vd = v - 2; + write_4x8(vd, pitch, q4ResultH); +}
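+ + /* For reference, a scalar sketch of the VP8 normal loop filter that + vp8_loop_filter_neon() vectorizes above (pixels biased by 0x80 into + signed range; clamp() is signed 8-bit saturation): + mask = max(|p3-p2|, |p2-p1|, |p1-p0|, |q1-q0|, |q2-q1|, |q3-q2|) <= limit + && 2 * |p0-q0| + |p1-q1| / 2 <= blimit + hev = |p1-p0| > thresh || |q1-q0| > thresh + a = clamp((hev ? clamp(p1 - q1) : 0) + 3 * (q0 - p0)) & mask + F1 = clamp(a + 4) >> 3; F2 = clamp(a + 3) >> 3 + q0 -= F1; p0 += F2 + if (!hev) { a = (F1 + 1) >> 1; q1 -= a; p1 += a; } + */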