diff options
Diffstat (limited to 'third_party/aom/av1/common/arm/warp_plane_sve.c')
-rw-r--r-- | third_party/aom/av1/common/arm/warp_plane_sve.c | 40 |
1 files changed, 22 insertions, 18 deletions
diff --git a/third_party/aom/av1/common/arm/warp_plane_sve.c b/third_party/aom/av1/common/arm/warp_plane_sve.c index 8a4bf5747b..c70b066174 100644 --- a/third_party/aom/av1/common/arm/warp_plane_sve.c +++ b/third_party/aom/av1/common/arm/warp_plane_sve.c @@ -11,7 +11,7 @@ #include <arm_neon.h> -#include "aom_dsp/arm/dot_sve.h" +#include "aom_dsp/arm/aom_neon_sve_bridge.h" #include "warp_plane_neon.h" DECLARE_ALIGNED(16, static const uint8_t, usdot_permute_idx[48]) = { @@ -20,8 +20,8 @@ DECLARE_ALIGNED(16, static const uint8_t, usdot_permute_idx[48]) = { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }; -static INLINE int16x8_t horizontal_filter_4x1_f4(const uint8x16_t in, int sx, - int alpha) { +static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f4(const uint8x16_t in, + int sx, int alpha) { const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); // Loading the 8 filter taps @@ -48,8 +48,8 @@ static INLINE int16x8_t horizontal_filter_4x1_f4(const uint8x16_t in, int sx, return vreinterpretq_s16_u16(res); } -static INLINE int16x8_t horizontal_filter_8x1_f8(const uint8x16_t in, int sx, - int alpha) { +static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f8(const uint8x16_t in, + int sx, int alpha) { const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); // Loading the 8 filter taps @@ -86,7 +86,8 @@ static INLINE int16x8_t horizontal_filter_8x1_f8(const uint8x16_t in, int sx, return vreinterpretq_s16_u16(res); } -static INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in, int sx) { +static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in, + int sx) { const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); int16x8_t f_s16 = @@ -115,7 +116,8 @@ static INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in, int sx) { return vreinterpretq_s16_u16(res); } -static INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in, int sx) { +static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in, + int sx) { const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); int16x8_t f_s16 = @@ -152,8 +154,8 @@ static INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in, int sx) { return vreinterpretq_s16_u16(res); } -static INLINE void vertical_filter_4x1_f1(const int16x8_t *src, int32x4_t *res, - int sy) { +static AOM_FORCE_INLINE void vertical_filter_4x1_f1(const int16x8_t *src, + int32x4_t *res, int sy) { int16x4_t s0 = vget_low_s16(src[0]); int16x4_t s1 = vget_low_s16(src[1]); int16x4_t s2 = vget_low_s16(src[2]); @@ -178,8 +180,9 @@ static INLINE void vertical_filter_4x1_f1(const int16x8_t *src, int32x4_t *res, *res = m0123; } -static INLINE void vertical_filter_4x1_f4(const int16x8_t *src, int32x4_t *res, - int sy, int gamma) { +static AOM_FORCE_INLINE void vertical_filter_4x1_f4(const int16x8_t *src, + int32x4_t *res, int sy, + int gamma) { int16x8_t s0, s1, s2, s3; transpose_elems_s16_4x8( vget_low_s16(src[0]), vget_low_s16(src[1]), vget_low_s16(src[2]), @@ -200,9 +203,10 @@ static INLINE void vertical_filter_4x1_f4(const int16x8_t *src, int32x4_t *res, *res = vcombine_s32(vmovn_s64(m01), vmovn_s64(m23)); } -static INLINE void vertical_filter_8x1_f1(const int16x8_t *src, - int32x4_t *res_low, - int32x4_t *res_high, int sy) { +static AOM_FORCE_INLINE void vertical_filter_8x1_f1(const int16x8_t *src, + int32x4_t *res_low, + int32x4_t *res_high, + int sy) { int16x8_t s0 = src[0]; int16x8_t s1 = src[1]; int16x8_t s2 = src[2]; @@ -237,10 +241,10 @@ static INLINE void vertical_filter_8x1_f1(const int16x8_t *src, *res_high = m4567; } -static INLINE void vertical_filter_8x1_f8(const int16x8_t *src, - int32x4_t *res_low, - int32x4_t *res_high, int sy, - int gamma) { +static AOM_FORCE_INLINE void vertical_filter_8x1_f8(const int16x8_t *src, + int32x4_t *res_low, + int32x4_t *res_high, int sy, + int gamma) { int16x8_t s0 = src[0]; int16x8_t s1 = src[1]; int16x8_t s2 = src[2]; |