summaryrefslogtreecommitdiffstats
path: root/third_party/aom/av1/common/arm/compound_convolve_neon_dotprod.c
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/aom/av1/common/arm/compound_convolve_neon_dotprod.c')
-rw-r--r--third_party/aom/av1/common/arm/compound_convolve_neon_dotprod.c55
1 files changed, 25 insertions, 30 deletions
diff --git a/third_party/aom/av1/common/arm/compound_convolve_neon_dotprod.c b/third_party/aom/av1/common/arm/compound_convolve_neon_dotprod.c
index 3aeffbb0e6..40befdf44e 100644
--- a/third_party/aom/av1/common/arm/compound_convolve_neon_dotprod.c
+++ b/third_party/aom/av1/common/arm/compound_convolve_neon_dotprod.c
@@ -80,17 +80,15 @@ static INLINE void dist_wtd_convolve_2d_horiz_neon_dotprod(
const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride,
const int16_t *x_filter_ptr, const int im_h, int w) {
const int bd = 8;
- const int32_t horiz_const = (1 << (bd + FILTER_BITS - 2));
// Dot product constants and other shims.
const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr);
- const int32_t correction_s32 =
- vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1));
- // Fold horiz_const into the dot-product filter correction constant. The
- // additional shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-
- // rounding shifts - which are generally faster than rounding shifts on
- // modern CPUs. (The extra -1 is needed because we halved the filter values.)
- const int32x4_t correction = vdupq_n_s32(correction_s32 + horiz_const +
- (1 << ((ROUND0_BITS - 1) - 1)));
+ // This shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts
+ // - which are generally faster than rounding shifts on modern CPUs.
+ const int32_t horiz_const =
+ ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));
+ // Halve the total because we will halve the filter values.
+ const int32x4_t correction =
+ vdupq_n_s32(((128 << FILTER_BITS) + horiz_const) / 2);
const uint8x16_t range_limit = vdupq_n_u8(128);
const uint8_t *src_ptr = src;
@@ -334,15 +332,14 @@ static INLINE void dist_wtd_convolve_x_dist_wtd_avg_neon_dotprod(
// Dot-product constants and other shims.
const uint8x16_t range_limit = vdupq_n_u8(128);
- const int32_t correction_s32 =
- vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1));
// Fold round_offset into the dot-product filter correction constant. The
- // additional shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-
- // rounding shifts - which are generally faster than rounding shifts on
- // modern CPUs. (The extra -1 is needed because we halved the filter values.)
+ // additional shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding
+ // shifts - which are generally faster than rounding shifts on modern CPUs.
+ // Halve the total because we will halve the filter values.
int32x4_t correction =
- vdupq_n_s32(correction_s32 + (round_offset << (ROUND0_BITS - 1)) +
- (1 << ((ROUND0_BITS - 1) - 1)));
+ vdupq_n_s32(((128 << FILTER_BITS) + (round_offset << ROUND0_BITS) +
+ (1 << (ROUND0_BITS - 1))) /
+ 2);
const int horiz_offset = filter_params_x->taps / 2 - 1;
const uint8_t *src_ptr = src - horiz_offset;
@@ -455,15 +452,14 @@ static INLINE void dist_wtd_convolve_x_avg_neon_dotprod(
// Dot-product constants and other shims.
const uint8x16_t range_limit = vdupq_n_u8(128);
- const int32_t correction_s32 =
- vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1));
// Fold round_offset into the dot-product filter correction constant. The
- // additional shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-
- // rounding shifts - which are generally faster than rounding shifts on
- // modern CPUs. (The extra -1 is needed because we halved the filter values.)
+ // additional shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding
+ // shifts - which are generally faster than rounding shifts on modern CPUs.
+ // Halve the total because we will halve the filter values.
int32x4_t correction =
- vdupq_n_s32(correction_s32 + (round_offset << (ROUND0_BITS - 1)) +
- (1 << ((ROUND0_BITS - 1) - 1)));
+ vdupq_n_s32(((128 << FILTER_BITS) + (round_offset << ROUND0_BITS) +
+ (1 << (ROUND0_BITS - 1))) /
+ 2);
const int horiz_offset = filter_params_x->taps / 2 - 1;
const uint8_t *src_ptr = src - horiz_offset;
@@ -574,15 +570,14 @@ static INLINE void dist_wtd_convolve_x_neon_dotprod(
// Dot-product constants and other shims.
const uint8x16_t range_limit = vdupq_n_u8(128);
- const int32_t correction_s32 =
- vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1));
// Fold round_offset into the dot-product filter correction constant. The
- // additional shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-
- // rounding shifts - which are generally faster than rounding shifts on
- // modern CPUs. (The extra -1 is needed because we halved the filter values.)
+ // additional shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding
+ // shifts - which are generally faster than rounding shifts on modern CPUs.
+ // Halve the total because we will halve the vilter values.
int32x4_t correction =
- vdupq_n_s32(correction_s32 + (round_offset << (ROUND0_BITS - 1)) +
- (1 << ((ROUND0_BITS - 1) - 1)));
+ vdupq_n_s32(((128 << FILTER_BITS) + (round_offset << ROUND0_BITS) +
+ (1 << (ROUND0_BITS - 1))) /
+ 2);
const int horiz_offset = filter_params_x->taps / 2 - 1;
const uint8_t *src_ptr = src - horiz_offset;