diff options
Diffstat (limited to 'media/libvpx/libvpx/vp9/encoder/x86')
13 files changed, 6066 insertions, 0 deletions
diff --git a/media/libvpx/libvpx/vp9/encoder/x86/highbd_temporal_filter_sse4.c b/media/libvpx/libvpx/vp9/encoder/x86/highbd_temporal_filter_sse4.c new file mode 100644 index 0000000000..97f182c660 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/x86/highbd_temporal_filter_sse4.c @@ -0,0 +1,893 @@ +/* + * Copyright (c) 2019 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <smmintrin.h> + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_temporal_filter.h" +#include "vp9/encoder/vp9_temporal_filter_constants.h" + +// Compute (a-b)**2 for 8 pixels with size 16-bit +static INLINE void highbd_store_dist_8(const uint16_t *a, const uint16_t *b, + uint32_t *dst) { + const __m128i zero = _mm_setzero_si128(); + const __m128i a_reg = _mm_loadu_si128((const __m128i *)a); + const __m128i b_reg = _mm_loadu_si128((const __m128i *)b); + + const __m128i a_first = _mm_cvtepu16_epi32(a_reg); + const __m128i a_second = _mm_unpackhi_epi16(a_reg, zero); + const __m128i b_first = _mm_cvtepu16_epi32(b_reg); + const __m128i b_second = _mm_unpackhi_epi16(b_reg, zero); + + __m128i dist_first, dist_second; + + dist_first = _mm_sub_epi32(a_first, b_first); + dist_second = _mm_sub_epi32(a_second, b_second); + dist_first = _mm_mullo_epi32(dist_first, dist_first); + dist_second = _mm_mullo_epi32(dist_second, dist_second); + + _mm_storeu_si128((__m128i *)dst, dist_first); + _mm_storeu_si128((__m128i *)(dst + 4), dist_second); +} + +// Sum up three neighboring distortions for the pixels +static INLINE void highbd_get_sum_4(const uint32_t 
*dist, __m128i *sum) { + __m128i dist_reg, dist_left, dist_right; + + dist_reg = _mm_loadu_si128((const __m128i *)dist); + dist_left = _mm_loadu_si128((const __m128i *)(dist - 1)); + dist_right = _mm_loadu_si128((const __m128i *)(dist + 1)); + + *sum = _mm_add_epi32(dist_reg, dist_left); + *sum = _mm_add_epi32(*sum, dist_right); +} + +static INLINE void highbd_get_sum_8(const uint32_t *dist, __m128i *sum_first, + __m128i *sum_second) { + highbd_get_sum_4(dist, sum_first); + highbd_get_sum_4(dist + 4, sum_second); +} + +// Average the value based on the number of values summed (9 for pixels away +// from the border, 4 for pixels in corners, and 6 for other edge values, plus +// however many values from y/uv plane are). +// +// Add in the rounding factor and shift, clamp to 16, then subtract from 16 and +// multiply by weight. +static INLINE void highbd_average_4(__m128i *output, const __m128i *sum, + const __m128i *mul_constants, + const int strength, const int rounding, + const int weight) { + // _mm_srl_epi32 uses the lower 64 bit value for the shift. 
+ const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength); + const __m128i rounding_u32 = _mm_set1_epi32(rounding); + const __m128i weight_u32 = _mm_set1_epi32(weight); + const __m128i sixteen = _mm_set1_epi32(16); + const __m128i zero = _mm_setzero_si128(); + + // modifier * 3 / index; + const __m128i sum_lo = _mm_unpacklo_epi32(*sum, zero); + const __m128i sum_hi = _mm_unpackhi_epi32(*sum, zero); + const __m128i const_lo = _mm_unpacklo_epi32(*mul_constants, zero); + const __m128i const_hi = _mm_unpackhi_epi32(*mul_constants, zero); + + const __m128i mul_lo = _mm_mul_epu32(sum_lo, const_lo); + const __m128i mul_lo_div = _mm_srli_epi64(mul_lo, 32); + const __m128i mul_hi = _mm_mul_epu32(sum_hi, const_hi); + const __m128i mul_hi_div = _mm_srli_epi64(mul_hi, 32); + + // Now we have + // mul_lo: 00 a1 00 a0 + // mul_hi: 00 a3 00 a2 + // Unpack as 64 bit words to get even and odd elements + // unpack_lo: 00 a2 00 a0 + // unpack_hi: 00 a3 00 a1 + // Then we can shift and OR the results to get everything in 32-bits + const __m128i mul_even = _mm_unpacklo_epi64(mul_lo_div, mul_hi_div); + const __m128i mul_odd = _mm_unpackhi_epi64(mul_lo_div, mul_hi_div); + const __m128i mul_odd_shift = _mm_slli_si128(mul_odd, 4); + const __m128i mul = _mm_or_si128(mul_even, mul_odd_shift); + + // Round + *output = _mm_add_epi32(mul, rounding_u32); + *output = _mm_srl_epi32(*output, strength_u128); + + // Multiply with the weight + *output = _mm_min_epu32(*output, sixteen); + *output = _mm_sub_epi32(sixteen, *output); + *output = _mm_mullo_epi32(*output, weight_u32); +} + +static INLINE void highbd_average_8(__m128i *output_0, __m128i *output_1, + const __m128i *sum_0_u32, + const __m128i *sum_1_u32, + const __m128i *mul_constants_0, + const __m128i *mul_constants_1, + const int strength, const int rounding, + const int weight) { + highbd_average_4(output_0, sum_0_u32, mul_constants_0, strength, rounding, + weight); + highbd_average_4(output_1, sum_1_u32, mul_constants_1, strength, 
rounding, + weight); +} + +// Add 'sum_u32' to 'count'. Multiply by 'pred' and add to 'accumulator.' +static INLINE void highbd_accumulate_and_store_8(const __m128i sum_first_u32, + const __m128i sum_second_u32, + const uint16_t *pred, + uint16_t *count, + uint32_t *accumulator) { + // Cast down to 16-bit ints + const __m128i sum_u16 = _mm_packus_epi32(sum_first_u32, sum_second_u32); + const __m128i zero = _mm_setzero_si128(); + + __m128i pred_u16 = _mm_loadu_si128((const __m128i *)pred); + __m128i count_u16 = _mm_loadu_si128((const __m128i *)count); + + __m128i pred_0_u32, pred_1_u32; + __m128i accum_0_u32, accum_1_u32; + + count_u16 = _mm_adds_epu16(count_u16, sum_u16); + _mm_storeu_si128((__m128i *)count, count_u16); + + pred_0_u32 = _mm_cvtepu16_epi32(pred_u16); + pred_1_u32 = _mm_unpackhi_epi16(pred_u16, zero); + + pred_0_u32 = _mm_mullo_epi32(sum_first_u32, pred_0_u32); + pred_1_u32 = _mm_mullo_epi32(sum_second_u32, pred_1_u32); + + accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator); + accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4)); + + accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32); + accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32); + + _mm_storeu_si128((__m128i *)accumulator, accum_0_u32); + _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32); +} + +static INLINE void highbd_read_dist_4(const uint32_t *dist, __m128i *dist_reg) { + *dist_reg = _mm_loadu_si128((const __m128i *)dist); +} + +static INLINE void highbd_read_dist_8(const uint32_t *dist, __m128i *reg_first, + __m128i *reg_second) { + highbd_read_dist_4(dist, reg_first); + highbd_read_dist_4(dist + 4, reg_second); +} + +static INLINE void highbd_read_chroma_dist_row_8( + int ss_x, const uint32_t *u_dist, const uint32_t *v_dist, __m128i *u_first, + __m128i *u_second, __m128i *v_first, __m128i *v_second) { + if (!ss_x) { + // If there is no chroma subsampling in the horizontal direction, then we + // need to load 8 entries from chroma. 
+ highbd_read_dist_8(u_dist, u_first, u_second); + highbd_read_dist_8(v_dist, v_first, v_second); + } else { // ss_x == 1 + // Otherwise, we only need to load 4 entries and duplicate each of them + __m128i u_reg, v_reg; + + highbd_read_dist_4(u_dist, &u_reg); + + *u_first = _mm_unpacklo_epi32(u_reg, u_reg); + *u_second = _mm_unpackhi_epi32(u_reg, u_reg); + + highbd_read_dist_4(v_dist, &v_reg); + + *v_first = _mm_unpacklo_epi32(v_reg, v_reg); + *v_second = _mm_unpackhi_epi32(v_reg, v_reg); + } +} + +static void vp9_highbd_apply_temporal_filter_luma_8( + const uint16_t *y_pre, int y_pre_stride, unsigned int block_width, + unsigned int block_height, int ss_x, int ss_y, int strength, + int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, + const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist, + const uint32_t *const *neighbors_first, + const uint32_t *const *neighbors_second, int top_weight, + int bottom_weight) { + const int rounding = (1 << strength) >> 1; + int weight = top_weight; + + __m128i mul_first, mul_second; + + __m128i sum_row_1_first, sum_row_1_second; + __m128i sum_row_2_first, sum_row_2_second; + __m128i sum_row_3_first, sum_row_3_second; + + __m128i u_first, u_second; + __m128i v_first, v_second; + + __m128i sum_row_first; + __m128i sum_row_second; + + // Loop variables + unsigned int h; + + assert(strength >= 4 && strength <= 14 && + "invalid adjusted temporal filter strength"); + assert(block_width == 8); + + (void)block_width; + + // First row + mul_first = _mm_load_si128((const __m128i *)neighbors_first[0]); + mul_second = _mm_load_si128((const __m128i *)neighbors_second[0]); + + // Add luma values + highbd_get_sum_8(y_dist, &sum_row_2_first, &sum_row_2_second); + highbd_get_sum_8(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second); + + // We don't need to saturate here because the maximum value is UINT12_MAX ** 2 + // * 9 ~= 2**24 * 9 < 2 ** 28 < INT32_MAX + sum_row_first = _mm_add_epi32(sum_row_2_first, sum_row_3_first); + sum_row_second = 
_mm_add_epi32(sum_row_2_second, sum_row_3_second); + + // Add chroma values + highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second, + &v_first, &v_second); + + // Max value here is 2 ** 24 * (9 + 2), so no saturation is needed + sum_row_first = _mm_add_epi32(sum_row_first, u_first); + sum_row_second = _mm_add_epi32(sum_row_second, u_second); + + sum_row_first = _mm_add_epi32(sum_row_first, v_first); + sum_row_second = _mm_add_epi32(sum_row_second, v_second); + + // Get modifier and store result + highbd_average_8(&sum_row_first, &sum_row_second, &sum_row_first, + &sum_row_second, &mul_first, &mul_second, strength, rounding, + weight); + + highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); + + y_pre += y_pre_stride; + y_count += y_pre_stride; + y_accum += y_pre_stride; + y_dist += DIST_STRIDE; + + u_dist += DIST_STRIDE; + v_dist += DIST_STRIDE; + + // Then all the rows except the last one + mul_first = _mm_load_si128((const __m128i *)neighbors_first[1]); + mul_second = _mm_load_si128((const __m128i *)neighbors_second[1]); + + for (h = 1; h < block_height - 1; ++h) { + // Move the weight to bottom half + if (!use_whole_blk && h == block_height / 2) { + weight = bottom_weight; + } + // Shift the rows up + sum_row_1_first = sum_row_2_first; + sum_row_1_second = sum_row_2_second; + sum_row_2_first = sum_row_3_first; + sum_row_2_second = sum_row_3_second; + + // Add luma values to the modifier + sum_row_first = _mm_add_epi32(sum_row_1_first, sum_row_2_first); + sum_row_second = _mm_add_epi32(sum_row_1_second, sum_row_2_second); + + highbd_get_sum_8(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second); + + sum_row_first = _mm_add_epi32(sum_row_first, sum_row_3_first); + sum_row_second = _mm_add_epi32(sum_row_second, sum_row_3_second); + + // Add chroma values to the modifier + if (ss_y == 0 || h % 2 == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma 
row + highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second, + &v_first, &v_second); + + u_dist += DIST_STRIDE; + v_dist += DIST_STRIDE; + } + + sum_row_first = _mm_add_epi32(sum_row_first, u_first); + sum_row_second = _mm_add_epi32(sum_row_second, u_second); + sum_row_first = _mm_add_epi32(sum_row_first, v_first); + sum_row_second = _mm_add_epi32(sum_row_second, v_second); + + // Get modifier and store result + highbd_average_8(&sum_row_first, &sum_row_second, &sum_row_first, + &sum_row_second, &mul_first, &mul_second, strength, + rounding, weight); + highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); + + y_pre += y_pre_stride; + y_count += y_pre_stride; + y_accum += y_pre_stride; + y_dist += DIST_STRIDE; + } + + // The last row + mul_first = _mm_load_si128((const __m128i *)neighbors_first[0]); + mul_second = _mm_load_si128((const __m128i *)neighbors_second[0]); + + // Shift the rows up + sum_row_1_first = sum_row_2_first; + sum_row_1_second = sum_row_2_second; + sum_row_2_first = sum_row_3_first; + sum_row_2_second = sum_row_3_second; + + // Add luma values to the modifier + sum_row_first = _mm_add_epi32(sum_row_1_first, sum_row_2_first); + sum_row_second = _mm_add_epi32(sum_row_1_second, sum_row_2_second); + + // Add chroma values to the modifier + if (ss_y == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma row + highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second, + &v_first, &v_second); + } + + sum_row_first = _mm_add_epi32(sum_row_first, u_first); + sum_row_second = _mm_add_epi32(sum_row_second, u_second); + sum_row_first = _mm_add_epi32(sum_row_first, v_first); + sum_row_second = _mm_add_epi32(sum_row_second, v_second); + + // Get modifier and store result + highbd_average_8(&sum_row_first, &sum_row_second, &sum_row_first, + &sum_row_second, &mul_first, &mul_second, strength, rounding, + weight); + 
highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); +} + +// Perform temporal filter for the luma component. +static void vp9_highbd_apply_temporal_filter_luma( + const uint16_t *y_pre, int y_pre_stride, unsigned int block_width, + unsigned int block_height, int ss_x, int ss_y, int strength, + const int *blk_fw, int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, + const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist) { + unsigned int blk_col = 0, uv_blk_col = 0; + const unsigned int blk_col_step = 8, uv_blk_col_step = 8 >> ss_x; + const unsigned int mid_width = block_width >> 1, + last_width = block_width - blk_col_step; + int top_weight = blk_fw[0], + bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2]; + const uint32_t *const *neighbors_first; + const uint32_t *const *neighbors_second; + + // Left + neighbors_first = HIGHBD_LUMA_LEFT_COLUMN_NEIGHBORS; + neighbors_second = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS; + vp9_highbd_apply_temporal_filter_luma_8( + y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y, + strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_first, neighbors_second, top_weight, bottom_weight); + + blk_col += blk_col_step; + uv_blk_col += uv_blk_col_step; + + // Middle First + neighbors_first = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS; + for (; blk_col < mid_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + vp9_highbd_apply_temporal_filter_luma_8( + y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y, + strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_first, neighbors_second, top_weight, bottom_weight); + } + + if (!use_whole_blk) { + top_weight = blk_fw[1]; + bottom_weight = blk_fw[3]; + } + + // Middle Second + for (; blk_col < last_width; + blk_col += blk_col_step, 
uv_blk_col += uv_blk_col_step) { + vp9_highbd_apply_temporal_filter_luma_8( + y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y, + strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_first, neighbors_second, top_weight, bottom_weight); + } + + // Right + neighbors_second = HIGHBD_LUMA_RIGHT_COLUMN_NEIGHBORS; + vp9_highbd_apply_temporal_filter_luma_8( + y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y, + strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_first, neighbors_second, top_weight, bottom_weight); +} + +// Add a row of luma distortion that corresponds to 8 chroma mods. If we are +// subsampling in x direction, then we have 16 lumas, else we have 8. +static INLINE void highbd_add_luma_dist_to_8_chroma_mod( + const uint32_t *y_dist, int ss_x, int ss_y, __m128i *u_mod_fst, + __m128i *u_mod_snd, __m128i *v_mod_fst, __m128i *v_mod_snd) { + __m128i y_reg_fst, y_reg_snd; + if (!ss_x) { + highbd_read_dist_8(y_dist, &y_reg_fst, &y_reg_snd); + if (ss_y == 1) { + __m128i y_tmp_fst, y_tmp_snd; + highbd_read_dist_8(y_dist + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd); + y_reg_fst = _mm_add_epi32(y_reg_fst, y_tmp_fst); + y_reg_snd = _mm_add_epi32(y_reg_snd, y_tmp_snd); + } + } else { + // Temporary + __m128i y_fst, y_snd; + + // First 8 + highbd_read_dist_8(y_dist, &y_fst, &y_snd); + if (ss_y == 1) { + __m128i y_tmp_fst, y_tmp_snd; + highbd_read_dist_8(y_dist + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd); + + y_fst = _mm_add_epi32(y_fst, y_tmp_fst); + y_snd = _mm_add_epi32(y_snd, y_tmp_snd); + } + + y_reg_fst = _mm_hadd_epi32(y_fst, y_snd); + + // Second 8 + highbd_read_dist_8(y_dist + 8, &y_fst, &y_snd); + if (ss_y == 1) { + __m128i y_tmp_fst, y_tmp_snd; + highbd_read_dist_8(y_dist + 8 + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd); + + y_fst = _mm_add_epi32(y_fst, y_tmp_fst); + 
y_snd = _mm_add_epi32(y_snd, y_tmp_snd); + } + + y_reg_snd = _mm_hadd_epi32(y_fst, y_snd); + } + + *u_mod_fst = _mm_add_epi32(*u_mod_fst, y_reg_fst); + *u_mod_snd = _mm_add_epi32(*u_mod_snd, y_reg_snd); + *v_mod_fst = _mm_add_epi32(*v_mod_fst, y_reg_fst); + *v_mod_snd = _mm_add_epi32(*v_mod_snd, y_reg_snd); +} + +// Apply temporal filter to the chroma components. This performs temporal +// filtering on a chroma block of 8 X uv_height. If blk_fw is not NULL, use +// blk_fw as an array of size 4 for the weights for each of the 4 subblocks, +// else use top_weight for top half, and bottom weight for bottom half. +static void vp9_highbd_apply_temporal_filter_chroma_8( + const uint16_t *u_pre, const uint16_t *v_pre, int uv_pre_stride, + unsigned int uv_block_width, unsigned int uv_block_height, int ss_x, + int ss_y, int strength, uint32_t *u_accum, uint16_t *u_count, + uint32_t *v_accum, uint16_t *v_count, const uint32_t *y_dist, + const uint32_t *u_dist, const uint32_t *v_dist, + const uint32_t *const *neighbors_fst, const uint32_t *const *neighbors_snd, + int top_weight, int bottom_weight, const int *blk_fw) { + const int rounding = (1 << strength) >> 1; + int weight = top_weight; + + __m128i mul_fst, mul_snd; + + __m128i u_sum_row_1_fst, u_sum_row_2_fst, u_sum_row_3_fst; + __m128i v_sum_row_1_fst, v_sum_row_2_fst, v_sum_row_3_fst; + __m128i u_sum_row_1_snd, u_sum_row_2_snd, u_sum_row_3_snd; + __m128i v_sum_row_1_snd, v_sum_row_2_snd, v_sum_row_3_snd; + + __m128i u_sum_row_fst, v_sum_row_fst; + __m128i u_sum_row_snd, v_sum_row_snd; + + // Loop variable + unsigned int h; + + (void)uv_block_width; + + // First row + mul_fst = _mm_load_si128((const __m128i *)neighbors_fst[0]); + mul_snd = _mm_load_si128((const __m128i *)neighbors_snd[0]); + + // Add chroma values + highbd_get_sum_8(u_dist, &u_sum_row_2_fst, &u_sum_row_2_snd); + highbd_get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3_fst, &u_sum_row_3_snd); + + u_sum_row_fst = _mm_add_epi32(u_sum_row_2_fst, u_sum_row_3_fst); 
+ u_sum_row_snd = _mm_add_epi32(u_sum_row_2_snd, u_sum_row_3_snd); + + highbd_get_sum_8(v_dist, &v_sum_row_2_fst, &v_sum_row_2_snd); + highbd_get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3_fst, &v_sum_row_3_snd); + + v_sum_row_fst = _mm_add_epi32(v_sum_row_2_fst, v_sum_row_3_fst); + v_sum_row_snd = _mm_add_epi32(v_sum_row_2_snd, v_sum_row_3_snd); + + // Add luma values + highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst, + &u_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd); + + // Get modifier and store result + if (blk_fw) { + highbd_average_4(&u_sum_row_fst, &u_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&u_sum_row_snd, &u_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + highbd_average_4(&v_sum_row_fst, &v_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&v_sum_row_snd, &v_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + } else { + highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, &u_sum_row_fst, + &u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + } + highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count, + u_accum); + highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count, + v_accum); + + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_dist += DIST_STRIDE * (1 + ss_y); + + // Then all the rows except the last one + mul_fst = _mm_load_si128((const __m128i *)neighbors_fst[1]); + mul_snd = _mm_load_si128((const __m128i *)neighbors_snd[1]); + + for (h = 1; h < uv_block_height - 1; ++h) { + // Move the weight pointer to the bottom half of the blocks + if (h == uv_block_height / 2) { + if (blk_fw) { + blk_fw 
+= 2; + } else { + weight = bottom_weight; + } + } + + // Shift the rows up + u_sum_row_1_fst = u_sum_row_2_fst; + u_sum_row_2_fst = u_sum_row_3_fst; + u_sum_row_1_snd = u_sum_row_2_snd; + u_sum_row_2_snd = u_sum_row_3_snd; + + v_sum_row_1_fst = v_sum_row_2_fst; + v_sum_row_2_fst = v_sum_row_3_fst; + v_sum_row_1_snd = v_sum_row_2_snd; + v_sum_row_2_snd = v_sum_row_3_snd; + + // Add chroma values + u_sum_row_fst = _mm_add_epi32(u_sum_row_1_fst, u_sum_row_2_fst); + u_sum_row_snd = _mm_add_epi32(u_sum_row_1_snd, u_sum_row_2_snd); + highbd_get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3_fst, &u_sum_row_3_snd); + u_sum_row_fst = _mm_add_epi32(u_sum_row_fst, u_sum_row_3_fst); + u_sum_row_snd = _mm_add_epi32(u_sum_row_snd, u_sum_row_3_snd); + + v_sum_row_fst = _mm_add_epi32(v_sum_row_1_fst, v_sum_row_2_fst); + v_sum_row_snd = _mm_add_epi32(v_sum_row_1_snd, v_sum_row_2_snd); + highbd_get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3_fst, &v_sum_row_3_snd); + v_sum_row_fst = _mm_add_epi32(v_sum_row_fst, v_sum_row_3_fst); + v_sum_row_snd = _mm_add_epi32(v_sum_row_snd, v_sum_row_3_snd); + + // Add luma values + highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst, + &u_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd); + + // Get modifier and store result + if (blk_fw) { + highbd_average_4(&u_sum_row_fst, &u_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&u_sum_row_snd, &u_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + highbd_average_4(&v_sum_row_fst, &v_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&v_sum_row_snd, &v_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + } else { + highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, &u_sum_row_fst, + &u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + } + + 
highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count, + u_accum); + highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count, + v_accum); + + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_dist += DIST_STRIDE * (1 + ss_y); + } + + // The last row + mul_fst = _mm_load_si128((const __m128i *)neighbors_fst[0]); + mul_snd = _mm_load_si128((const __m128i *)neighbors_snd[0]); + + // Shift the rows up + u_sum_row_1_fst = u_sum_row_2_fst; + u_sum_row_2_fst = u_sum_row_3_fst; + u_sum_row_1_snd = u_sum_row_2_snd; + u_sum_row_2_snd = u_sum_row_3_snd; + + v_sum_row_1_fst = v_sum_row_2_fst; + v_sum_row_2_fst = v_sum_row_3_fst; + v_sum_row_1_snd = v_sum_row_2_snd; + v_sum_row_2_snd = v_sum_row_3_snd; + + // Add chroma values + u_sum_row_fst = _mm_add_epi32(u_sum_row_1_fst, u_sum_row_2_fst); + v_sum_row_fst = _mm_add_epi32(v_sum_row_1_fst, v_sum_row_2_fst); + u_sum_row_snd = _mm_add_epi32(u_sum_row_1_snd, u_sum_row_2_snd); + v_sum_row_snd = _mm_add_epi32(v_sum_row_1_snd, v_sum_row_2_snd); + + // Add luma values + highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst, + &u_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd); + + // Get modifier and store result + if (blk_fw) { + highbd_average_4(&u_sum_row_fst, &u_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&u_sum_row_snd, &u_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + highbd_average_4(&v_sum_row_fst, &v_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&v_sum_row_snd, &v_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + } else { + highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, &u_sum_row_fst, + &u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, 
&v_sum_row_fst, + &v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + } + + highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count, + u_accum); + highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count, + v_accum); +} + +// Perform temporal filter for the chroma components. +static void vp9_highbd_apply_temporal_filter_chroma( + const uint16_t *u_pre, const uint16_t *v_pre, int uv_pre_stride, + unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, + int strength, const int *blk_fw, int use_whole_blk, uint32_t *u_accum, + uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, + const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist) { + const unsigned int uv_width = block_width >> ss_x, + uv_height = block_height >> ss_y; + + unsigned int blk_col = 0, uv_blk_col = 0; + const unsigned int uv_blk_col_step = 8, blk_col_step = 8 << ss_x; + const unsigned int uv_mid_width = uv_width >> 1, + uv_last_width = uv_width - uv_blk_col_step; + int top_weight = blk_fw[0], + bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2]; + const uint32_t *const *neighbors_fst; + const uint32_t *const *neighbors_snd; + + if (uv_width == 8) { + // Special Case: We are subsampling in x direction on a 16x16 block. Since + // we are operating on a row of 8 chroma pixels, we can't use the usual + // left-middle-right pattern. 
+ assert(ss_x); + + if (ss_y) { + neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else { + neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS; + } + + if (use_whole_blk) { + vp9_highbd_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL); + } else { + vp9_highbd_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_fst, neighbors_snd, 0, 0, blk_fw); + } + + return; + } + + // Left + if (ss_x && ss_y) { + neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else { + neighbors_fst = HIGHBD_CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS; + } + + vp9_highbd_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, + neighbors_snd, top_weight, bottom_weight, NULL); + + blk_col += blk_col_step; + uv_blk_col += uv_blk_col_step; + + // Middle First + if 
(ss_x && ss_y) { + neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else { + neighbors_fst = HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS; + } + + for (; uv_blk_col < uv_mid_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + vp9_highbd_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL); + } + + if (!use_whole_blk) { + top_weight = blk_fw[1]; + bottom_weight = blk_fw[3]; + } + + // Middle Second + for (; uv_blk_col < uv_last_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + vp9_highbd_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL); + } + + // Right + if (ss_x && ss_y) { + neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else { + neighbors_snd = HIGHBD_CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS; + } + + vp9_highbd_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, + neighbors_snd, top_weight, bottom_weight, NULL); +} + +void vp9_highbd_apply_temporal_filter_sse4_1( + const 
uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, + int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, + int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *const blk_fw, + int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, + uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count) { + const unsigned int chroma_height = block_height >> ss_y, + chroma_width = block_width >> ss_x; + + DECLARE_ALIGNED(16, uint32_t, y_dist[BH * DIST_STRIDE]) = { 0 }; + DECLARE_ALIGNED(16, uint32_t, u_dist[BH * DIST_STRIDE]) = { 0 }; + DECLARE_ALIGNED(16, uint32_t, v_dist[BH * DIST_STRIDE]) = { 0 }; + + uint32_t *y_dist_ptr = y_dist + 1, *u_dist_ptr = u_dist + 1, + *v_dist_ptr = v_dist + 1; + const uint16_t *y_src_ptr = y_src, *u_src_ptr = u_src, *v_src_ptr = v_src; + const uint16_t *y_pre_ptr = y_pre, *u_pre_ptr = u_pre, *v_pre_ptr = v_pre; + + // Loop variables + unsigned int row, blk_col; + + assert(block_width <= BW && "block width too large"); + assert(block_height <= BH && "block height too large"); + assert(block_width % 16 == 0 && "block width must be multiple of 16"); + assert(block_height % 2 == 0 && "block height must be even"); + assert((ss_x == 0 || ss_x == 1) && (ss_y == 0 || ss_y == 1) && + "invalid chroma subsampling"); + assert(strength >= 4 && strength <= 14 && + "invalid adjusted temporal filter strength"); + assert(blk_fw[0] >= 0 && "filter weight must be positive"); + assert( + (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) && + "subblock filter weight must be positive"); + assert(blk_fw[0] <= 2 && "sublock filter weight must be less than 2"); + assert( + (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) && + "subblock filter weight must be less than 2"); + + // Precompute the difference squared + for (row = 0; row < block_height; row++) { + 
for (blk_col = 0; blk_col < block_width; blk_col += 8) { + highbd_store_dist_8(y_src_ptr + blk_col, y_pre_ptr + blk_col, + y_dist_ptr + blk_col); + } + y_src_ptr += y_src_stride; + y_pre_ptr += y_pre_stride; + y_dist_ptr += DIST_STRIDE; + } + + for (row = 0; row < chroma_height; row++) { + for (blk_col = 0; blk_col < chroma_width; blk_col += 8) { + highbd_store_dist_8(u_src_ptr + blk_col, u_pre_ptr + blk_col, + u_dist_ptr + blk_col); + highbd_store_dist_8(v_src_ptr + blk_col, v_pre_ptr + blk_col, + v_dist_ptr + blk_col); + } + + u_src_ptr += uv_src_stride; + u_pre_ptr += uv_pre_stride; + u_dist_ptr += DIST_STRIDE; + v_src_ptr += uv_src_stride; + v_pre_ptr += uv_pre_stride; + v_dist_ptr += DIST_STRIDE; + } + + y_dist_ptr = y_dist + 1; + u_dist_ptr = u_dist + 1; + v_dist_ptr = v_dist + 1; + + vp9_highbd_apply_temporal_filter_luma(y_pre, y_pre_stride, block_width, + block_height, ss_x, ss_y, strength, + blk_fw, use_whole_blk, y_accum, y_count, + y_dist_ptr, u_dist_ptr, v_dist_ptr); + + vp9_highbd_apply_temporal_filter_chroma( + u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y, + strength, blk_fw, use_whole_blk, u_accum, u_count, v_accum, v_count, + y_dist_ptr, u_dist_ptr, v_dist_ptr); +} diff --git a/media/libvpx/libvpx/vp9/encoder/x86/temporal_filter_sse4.c b/media/libvpx/libvpx/vp9/encoder/x86/temporal_filter_sse4.c new file mode 100644 index 0000000000..7571bfccac --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/x86/temporal_filter_sse4.c @@ -0,0 +1,875 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <assert.h> +#include <smmintrin.h> + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_temporal_filter.h" +#include "vp9/encoder/vp9_temporal_filter_constants.h" + +// Read in 8 pixels from a and b as 8-bit unsigned integers, compute the +// difference squared, and store as unsigned 16-bit integer to dst. +static INLINE void store_dist_8(const uint8_t *a, const uint8_t *b, + uint16_t *dst) { + const __m128i a_reg = _mm_loadl_epi64((const __m128i *)a); + const __m128i b_reg = _mm_loadl_epi64((const __m128i *)b); + + const __m128i a_first = _mm_cvtepu8_epi16(a_reg); + const __m128i b_first = _mm_cvtepu8_epi16(b_reg); + + __m128i dist_first; + + dist_first = _mm_sub_epi16(a_first, b_first); + dist_first = _mm_mullo_epi16(dist_first, dist_first); + + _mm_storeu_si128((__m128i *)dst, dist_first); +} + +static INLINE void store_dist_16(const uint8_t *a, const uint8_t *b, + uint16_t *dst) { + const __m128i zero = _mm_setzero_si128(); + const __m128i a_reg = _mm_loadu_si128((const __m128i *)a); + const __m128i b_reg = _mm_loadu_si128((const __m128i *)b); + + const __m128i a_first = _mm_cvtepu8_epi16(a_reg); + const __m128i a_second = _mm_unpackhi_epi8(a_reg, zero); + const __m128i b_first = _mm_cvtepu8_epi16(b_reg); + const __m128i b_second = _mm_unpackhi_epi8(b_reg, zero); + + __m128i dist_first, dist_second; + + dist_first = _mm_sub_epi16(a_first, b_first); + dist_second = _mm_sub_epi16(a_second, b_second); + dist_first = _mm_mullo_epi16(dist_first, dist_first); + dist_second = _mm_mullo_epi16(dist_second, dist_second); + + _mm_storeu_si128((__m128i *)dst, dist_first); + _mm_storeu_si128((__m128i *)(dst + 8), dist_second); +} + +static INLINE void read_dist_8(const uint16_t *dist, __m128i *dist_reg) { + *dist_reg = _mm_loadu_si128((const __m128i *)dist); +} + +static INLINE void read_dist_16(const uint16_t *dist, __m128i *reg_first, + __m128i *reg_second) { + 
read_dist_8(dist, reg_first); + read_dist_8(dist + 8, reg_second); +} + +// Average the value based on the number of values summed (9 for pixels away +// from the border, 4 for pixels in corners, and 6 for other edge values). +// +// Add in the rounding factor and shift, clamp to 16, invert and shift. Multiply +// by weight. +static INLINE __m128i average_8(__m128i sum, const __m128i *mul_constants, + const int strength, const int rounding, + const __m128i *weight) { + // _mm_srl_epi16 uses the lower 64 bit value for the shift. + const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength); + const __m128i rounding_u16 = _mm_set1_epi16(rounding); + const __m128i weight_u16 = *weight; + const __m128i sixteen = _mm_set1_epi16(16); + + // modifier * 3 / index; + sum = _mm_mulhi_epu16(sum, *mul_constants); + + sum = _mm_adds_epu16(sum, rounding_u16); + sum = _mm_srl_epi16(sum, strength_u128); + + // The maximum input to this comparison is UINT16_MAX * NEIGHBOR_CONSTANT_4 + // >> 16 (also NEIGHBOR_CONSTANT_4 -1) which is 49151 / 0xbfff / -16385 + // So this needs to use the epu16 version which did not come until SSE4. + sum = _mm_min_epu16(sum, sixteen); + + sum = _mm_sub_epi16(sixteen, sum); + + return _mm_mullo_epi16(sum, weight_u16); +} + +// Add 'sum_u16' to 'count'. Multiply by 'pred' and add to 'accumulator.' 
+static void accumulate_and_store_8(const __m128i sum_u16, const uint8_t *pred, + uint16_t *count, uint32_t *accumulator) { + const __m128i pred_u8 = _mm_loadl_epi64((const __m128i *)pred); + const __m128i zero = _mm_setzero_si128(); + __m128i count_u16 = _mm_loadu_si128((const __m128i *)count); + __m128i pred_u16 = _mm_cvtepu8_epi16(pred_u8); + __m128i pred_0_u32, pred_1_u32; + __m128i accum_0_u32, accum_1_u32; + + count_u16 = _mm_adds_epu16(count_u16, sum_u16); + _mm_storeu_si128((__m128i *)count, count_u16); + + pred_u16 = _mm_mullo_epi16(sum_u16, pred_u16); + + pred_0_u32 = _mm_cvtepu16_epi32(pred_u16); + pred_1_u32 = _mm_unpackhi_epi16(pred_u16, zero); + + accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator); + accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4)); + + accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32); + accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32); + + _mm_storeu_si128((__m128i *)accumulator, accum_0_u32); + _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32); +} + +static INLINE void accumulate_and_store_16(const __m128i sum_0_u16, + const __m128i sum_1_u16, + const uint8_t *pred, uint16_t *count, + uint32_t *accumulator) { + const __m128i pred_u8 = _mm_loadu_si128((const __m128i *)pred); + const __m128i zero = _mm_setzero_si128(); + __m128i count_0_u16 = _mm_loadu_si128((const __m128i *)count), + count_1_u16 = _mm_loadu_si128((const __m128i *)(count + 8)); + __m128i pred_0_u16 = _mm_cvtepu8_epi16(pred_u8), + pred_1_u16 = _mm_unpackhi_epi8(pred_u8, zero); + __m128i pred_0_u32, pred_1_u32, pred_2_u32, pred_3_u32; + __m128i accum_0_u32, accum_1_u32, accum_2_u32, accum_3_u32; + + count_0_u16 = _mm_adds_epu16(count_0_u16, sum_0_u16); + _mm_storeu_si128((__m128i *)count, count_0_u16); + + count_1_u16 = _mm_adds_epu16(count_1_u16, sum_1_u16); + _mm_storeu_si128((__m128i *)(count + 8), count_1_u16); + + pred_0_u16 = _mm_mullo_epi16(sum_0_u16, pred_0_u16); + pred_1_u16 = _mm_mullo_epi16(sum_1_u16, pred_1_u16); + 
+ pred_0_u32 = _mm_cvtepu16_epi32(pred_0_u16); + pred_1_u32 = _mm_unpackhi_epi16(pred_0_u16, zero); + pred_2_u32 = _mm_cvtepu16_epi32(pred_1_u16); + pred_3_u32 = _mm_unpackhi_epi16(pred_1_u16, zero); + + accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator); + accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4)); + accum_2_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 8)); + accum_3_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 12)); + + accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32); + accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32); + accum_2_u32 = _mm_add_epi32(pred_2_u32, accum_2_u32); + accum_3_u32 = _mm_add_epi32(pred_3_u32, accum_3_u32); + + _mm_storeu_si128((__m128i *)accumulator, accum_0_u32); + _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32); + _mm_storeu_si128((__m128i *)(accumulator + 8), accum_2_u32); + _mm_storeu_si128((__m128i *)(accumulator + 12), accum_3_u32); +} + +// Read in 8 pixels from y_dist. For each index i, compute y_dist[i-1] + +// y_dist[i] + y_dist[i+1] and store in sum as 16-bit unsigned int. +static INLINE void get_sum_8(const uint16_t *y_dist, __m128i *sum) { + __m128i dist_reg, dist_left, dist_right; + + dist_reg = _mm_loadu_si128((const __m128i *)y_dist); + dist_left = _mm_loadu_si128((const __m128i *)(y_dist - 1)); + dist_right = _mm_loadu_si128((const __m128i *)(y_dist + 1)); + + *sum = _mm_adds_epu16(dist_reg, dist_left); + *sum = _mm_adds_epu16(*sum, dist_right); +} + +// Read in 16 pixels from y_dist. For each index i, compute y_dist[i-1] + +// y_dist[i] + y_dist[i+1]. Store the result for first 8 pixels in sum_first and +// the rest in sum_second. +static INLINE void get_sum_16(const uint16_t *y_dist, __m128i *sum_first, + __m128i *sum_second) { + get_sum_8(y_dist, sum_first); + get_sum_8(y_dist + 8, sum_second); +} + +// Read in a row of chroma values corresponds to a row of 16 luma values. 
+static INLINE void read_chroma_dist_row_16(int ss_x, const uint16_t *u_dist, + const uint16_t *v_dist, + __m128i *u_first, __m128i *u_second, + __m128i *v_first, + __m128i *v_second) { + if (!ss_x) { + // If there is no chroma subsampling in the horizontal direction, then we + // need to load 16 entries from chroma. + read_dist_16(u_dist, u_first, u_second); + read_dist_16(v_dist, v_first, v_second); + } else { // ss_x == 1 + // Otherwise, we only need to load 8 entries + __m128i u_reg, v_reg; + + read_dist_8(u_dist, &u_reg); + + *u_first = _mm_unpacklo_epi16(u_reg, u_reg); + *u_second = _mm_unpackhi_epi16(u_reg, u_reg); + + read_dist_8(v_dist, &v_reg); + + *v_first = _mm_unpacklo_epi16(v_reg, v_reg); + *v_second = _mm_unpackhi_epi16(v_reg, v_reg); + } +} + +// Horizontal add unsigned 16-bit ints in src and store them as signed 32-bit +// int in dst. +static INLINE void hadd_epu16(__m128i *src, __m128i *dst) { + const __m128i zero = _mm_setzero_si128(); + const __m128i shift_right = _mm_srli_si128(*src, 2); + + const __m128i odd = _mm_blend_epi16(shift_right, zero, 170); + const __m128i even = _mm_blend_epi16(*src, zero, 170); + + *dst = _mm_add_epi32(even, odd); +} + +// Add a row of luma distortion to 8 corresponding chroma mods. 
+static INLINE void add_luma_dist_to_8_chroma_mod(const uint16_t *y_dist, + int ss_x, int ss_y, + __m128i *u_mod, + __m128i *v_mod) { + __m128i y_reg; + if (!ss_x) { + read_dist_8(y_dist, &y_reg); + if (ss_y == 1) { + __m128i y_tmp; + read_dist_8(y_dist + DIST_STRIDE, &y_tmp); + + y_reg = _mm_adds_epu16(y_reg, y_tmp); + } + } else { + __m128i y_first, y_second; + read_dist_16(y_dist, &y_first, &y_second); + if (ss_y == 1) { + __m128i y_tmp_0, y_tmp_1; + read_dist_16(y_dist + DIST_STRIDE, &y_tmp_0, &y_tmp_1); + + y_first = _mm_adds_epu16(y_first, y_tmp_0); + y_second = _mm_adds_epu16(y_second, y_tmp_1); + } + + hadd_epu16(&y_first, &y_first); + hadd_epu16(&y_second, &y_second); + + y_reg = _mm_packus_epi32(y_first, y_second); + } + + *u_mod = _mm_adds_epu16(*u_mod, y_reg); + *v_mod = _mm_adds_epu16(*v_mod, y_reg); +} + +// Apply temporal filter to the luma components. This performs temporal +// filtering on a luma block of 16 X block_height. Use blk_fw as an array of +// size 4 for the weights for each of the 4 subblocks if blk_fw is not NULL, +// else use top_weight for top half, and bottom weight for bottom half. 
+static void vp9_apply_temporal_filter_luma_16( + const uint8_t *y_pre, int y_pre_stride, unsigned int block_width, + unsigned int block_height, int ss_x, int ss_y, int strength, + int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, + const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist, + const int16_t *const *neighbors_first, + const int16_t *const *neighbors_second, int top_weight, int bottom_weight, + const int *blk_fw) { + const int rounding = (1 << strength) >> 1; + __m128i weight_first, weight_second; + + __m128i mul_first, mul_second; + + __m128i sum_row_1_first, sum_row_1_second; + __m128i sum_row_2_first, sum_row_2_second; + __m128i sum_row_3_first, sum_row_3_second; + + __m128i u_first, u_second; + __m128i v_first, v_second; + + __m128i sum_row_first; + __m128i sum_row_second; + + // Loop variables + unsigned int h; + + assert(strength >= 0); + assert(strength <= 6); + + assert(block_width == 16); + (void)block_width; + + // Initialize the weights + if (blk_fw) { + weight_first = _mm_set1_epi16(blk_fw[0]); + weight_second = _mm_set1_epi16(blk_fw[1]); + } else { + weight_first = _mm_set1_epi16(top_weight); + weight_second = weight_first; + } + + // First row + mul_first = _mm_load_si128((const __m128i *)neighbors_first[0]); + mul_second = _mm_load_si128((const __m128i *)neighbors_second[0]); + + // Add luma values + get_sum_16(y_dist, &sum_row_2_first, &sum_row_2_second); + get_sum_16(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second); + + sum_row_first = _mm_adds_epu16(sum_row_2_first, sum_row_3_first); + sum_row_second = _mm_adds_epu16(sum_row_2_second, sum_row_3_second); + + // Add chroma values + read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first, + &v_second); + + sum_row_first = _mm_adds_epu16(sum_row_first, u_first); + sum_row_second = _mm_adds_epu16(sum_row_second, u_second); + + sum_row_first = _mm_adds_epu16(sum_row_first, v_first); + sum_row_second = _mm_adds_epu16(sum_row_second, 
v_second); + + // Get modifier and store result + sum_row_first = + average_8(sum_row_first, &mul_first, strength, rounding, &weight_first); + sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding, + &weight_second); + accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); + + y_pre += y_pre_stride; + y_count += y_pre_stride; + y_accum += y_pre_stride; + y_dist += DIST_STRIDE; + + u_dist += DIST_STRIDE; + v_dist += DIST_STRIDE; + + // Then all the rows except the last one + mul_first = _mm_load_si128((const __m128i *)neighbors_first[1]); + mul_second = _mm_load_si128((const __m128i *)neighbors_second[1]); + + for (h = 1; h < block_height - 1; ++h) { + // Move the weight to bottom half + if (!use_whole_blk && h == block_height / 2) { + if (blk_fw) { + weight_first = _mm_set1_epi16(blk_fw[2]); + weight_second = _mm_set1_epi16(blk_fw[3]); + } else { + weight_first = _mm_set1_epi16(bottom_weight); + weight_second = weight_first; + } + } + // Shift the rows up + sum_row_1_first = sum_row_2_first; + sum_row_1_second = sum_row_2_second; + sum_row_2_first = sum_row_3_first; + sum_row_2_second = sum_row_3_second; + + // Add luma values to the modifier + sum_row_first = _mm_adds_epu16(sum_row_1_first, sum_row_2_first); + sum_row_second = _mm_adds_epu16(sum_row_1_second, sum_row_2_second); + + get_sum_16(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second); + + sum_row_first = _mm_adds_epu16(sum_row_first, sum_row_3_first); + sum_row_second = _mm_adds_epu16(sum_row_second, sum_row_3_second); + + // Add chroma values to the modifier + if (ss_y == 0 || h % 2 == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma row + read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, + &v_first, &v_second); + + u_dist += DIST_STRIDE; + v_dist += DIST_STRIDE; + } + + sum_row_first = _mm_adds_epu16(sum_row_first, u_first); + sum_row_second = 
_mm_adds_epu16(sum_row_second, u_second); + sum_row_first = _mm_adds_epu16(sum_row_first, v_first); + sum_row_second = _mm_adds_epu16(sum_row_second, v_second); + + // Get modifier and store result + sum_row_first = + average_8(sum_row_first, &mul_first, strength, rounding, &weight_first); + sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding, + &weight_second); + accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); + + y_pre += y_pre_stride; + y_count += y_pre_stride; + y_accum += y_pre_stride; + y_dist += DIST_STRIDE; + } + + // The last row + mul_first = _mm_load_si128((const __m128i *)neighbors_first[0]); + mul_second = _mm_load_si128((const __m128i *)neighbors_second[0]); + + // Shift the rows up + sum_row_1_first = sum_row_2_first; + sum_row_1_second = sum_row_2_second; + sum_row_2_first = sum_row_3_first; + sum_row_2_second = sum_row_3_second; + + // Add luma values to the modifier + sum_row_first = _mm_adds_epu16(sum_row_1_first, sum_row_2_first); + sum_row_second = _mm_adds_epu16(sum_row_1_second, sum_row_2_second); + + // Add chroma values to the modifier + if (ss_y == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma row + read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first, + &v_second); + } + + sum_row_first = _mm_adds_epu16(sum_row_first, u_first); + sum_row_second = _mm_adds_epu16(sum_row_second, u_second); + sum_row_first = _mm_adds_epu16(sum_row_first, v_first); + sum_row_second = _mm_adds_epu16(sum_row_second, v_second); + + // Get modifier and store result + sum_row_first = + average_8(sum_row_first, &mul_first, strength, rounding, &weight_first); + sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding, + &weight_second); + accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); +} + +// Perform temporal filter for the luma component. 
+static void vp9_apply_temporal_filter_luma( + const uint8_t *y_pre, int y_pre_stride, unsigned int block_width, + unsigned int block_height, int ss_x, int ss_y, int strength, + const int *blk_fw, int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, + const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist) { + unsigned int blk_col = 0, uv_blk_col = 0; + const unsigned int blk_col_step = 16, uv_blk_col_step = 16 >> ss_x; + const unsigned int mid_width = block_width >> 1, + last_width = block_width - blk_col_step; + int top_weight = blk_fw[0], + bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2]; + const int16_t *const *neighbors_first; + const int16_t *const *neighbors_second; + + if (block_width == 16) { + // Special Case: The blockwidth is 16 and we are operating on a row of 16 + // chroma pixels. In this case, we can't use the usual left-middle-right + // pattern. We also don't support splitting now. + neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS; + neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS; + if (use_whole_blk) { + vp9_apply_temporal_filter_luma_16( + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); + } else { + vp9_apply_temporal_filter_luma_16( + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, 0, 0, blk_fw); + } + + return; + } + + // Left + neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS; + neighbors_second = LUMA_MIDDLE_COLUMN_NEIGHBORS; + vp9_apply_temporal_filter_luma_16( + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + 
uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); + + blk_col += blk_col_step; + uv_blk_col += uv_blk_col_step; + + // Middle First + neighbors_first = LUMA_MIDDLE_COLUMN_NEIGHBORS; + for (; blk_col < mid_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + vp9_apply_temporal_filter_luma_16( + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); + } + + if (!use_whole_blk) { + top_weight = blk_fw[1]; + bottom_weight = blk_fw[3]; + } + + // Middle Second + for (; blk_col < last_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + vp9_apply_temporal_filter_luma_16( + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); + } + + // Right + neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS; + vp9_apply_temporal_filter_luma_16( + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); +} + +// Apply temporal filter to the chroma components. This performs temporal +// filtering on a chroma block of 8 X uv_height. If blk_fw is not NULL, use +// blk_fw as an array of size 4 for the weights for each of the 4 subblocks, +// else use top_weight for top half, and bottom weight for bottom half. 
+static void vp9_apply_temporal_filter_chroma_8( + const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, + unsigned int uv_block_height, int ss_x, int ss_y, int strength, + uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, + const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist, + const int16_t *const *neighbors, int top_weight, int bottom_weight, + const int *blk_fw) { + const int rounding = (1 << strength) >> 1; + + __m128i weight; + + __m128i mul; + + __m128i u_sum_row_1, u_sum_row_2, u_sum_row_3; + __m128i v_sum_row_1, v_sum_row_2, v_sum_row_3; + + __m128i u_sum_row, v_sum_row; + + // Loop variable + unsigned int h; + + // Initialize weight + if (blk_fw) { + weight = _mm_setr_epi16(blk_fw[0], blk_fw[0], blk_fw[0], blk_fw[0], + blk_fw[1], blk_fw[1], blk_fw[1], blk_fw[1]); + } else { + weight = _mm_set1_epi16(top_weight); + } + + // First row + mul = _mm_load_si128((const __m128i *)neighbors[0]); + + // Add chroma values + get_sum_8(u_dist, &u_sum_row_2); + get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3); + + u_sum_row = _mm_adds_epu16(u_sum_row_2, u_sum_row_3); + + get_sum_8(v_dist, &v_sum_row_2); + get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3); + + v_sum_row = _mm_adds_epu16(v_sum_row_2, v_sum_row_3); + + // Add luma values + add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row); + + // Get modifier and store result + u_sum_row = average_8(u_sum_row, &mul, strength, rounding, &weight); + v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight); + + accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum); + accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum); + + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_dist += DIST_STRIDE * (1 + ss_y); + + // Then all the rows except the last one + mul = 
_mm_load_si128((const __m128i *)neighbors[1]); + + for (h = 1; h < uv_block_height - 1; ++h) { + // Move the weight pointer to the bottom half of the blocks + if (h == uv_block_height / 2) { + if (blk_fw) { + weight = _mm_setr_epi16(blk_fw[2], blk_fw[2], blk_fw[2], blk_fw[2], + blk_fw[3], blk_fw[3], blk_fw[3], blk_fw[3]); + } else { + weight = _mm_set1_epi16(bottom_weight); + } + } + + // Shift the rows up + u_sum_row_1 = u_sum_row_2; + u_sum_row_2 = u_sum_row_3; + + v_sum_row_1 = v_sum_row_2; + v_sum_row_2 = v_sum_row_3; + + // Add chroma values + u_sum_row = _mm_adds_epu16(u_sum_row_1, u_sum_row_2); + get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3); + u_sum_row = _mm_adds_epu16(u_sum_row, u_sum_row_3); + + v_sum_row = _mm_adds_epu16(v_sum_row_1, v_sum_row_2); + get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3); + v_sum_row = _mm_adds_epu16(v_sum_row, v_sum_row_3); + + // Add luma values + add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row); + + // Get modifier and store result + u_sum_row = average_8(u_sum_row, &mul, strength, rounding, &weight); + v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight); + + accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum); + accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum); + + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_dist += DIST_STRIDE * (1 + ss_y); + } + + // The last row + mul = _mm_load_si128((const __m128i *)neighbors[0]); + + // Shift the rows up + u_sum_row_1 = u_sum_row_2; + u_sum_row_2 = u_sum_row_3; + + v_sum_row_1 = v_sum_row_2; + v_sum_row_2 = v_sum_row_3; + + // Add chroma values + u_sum_row = _mm_adds_epu16(u_sum_row_1, u_sum_row_2); + v_sum_row = _mm_adds_epu16(v_sum_row_1, v_sum_row_2); + + // Add luma values + add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row); + + // 
Get modifier and store result + u_sum_row = average_8(u_sum_row, &mul, strength, rounding, &weight); + v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight); + + accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum); + accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum); +} + +// Perform temporal filter for the chroma components. +static void vp9_apply_temporal_filter_chroma( + const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, + unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, + int strength, const int *blk_fw, int use_whole_blk, uint32_t *u_accum, + uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, + const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist) { + const unsigned int uv_width = block_width >> ss_x, + uv_height = block_height >> ss_y; + + unsigned int blk_col = 0, uv_blk_col = 0; + const unsigned int uv_blk_col_step = 8, blk_col_step = 8 << ss_x; + const unsigned int uv_mid_width = uv_width >> 1, + uv_last_width = uv_width - uv_blk_col_step; + int top_weight = blk_fw[0], + bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2]; + const int16_t *const *neighbors; + + if (uv_width == 8) { + // Special Case: We are subsampling in x direction on a 16x16 block. Since + // we are operating on a row of 8 chroma pixels, we can't use the usual + // left-middle-right pattern. 
+ assert(ss_x); + + if (ss_y) { + neighbors = CHROMA_DOUBLE_SS_SINGLE_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_SINGLE_SS_SINGLE_COLUMN_NEIGHBORS; + } + + if (use_whole_blk) { + vp9_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, + ss_x, ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); + } else { + vp9_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, + ss_x, ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, 0, 0, blk_fw); + } + + return; + } + + // Left + if (ss_x && ss_y) { + neighbors = CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors = CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS; + } + + vp9_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x, + ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); + + blk_col += blk_col_step; + uv_blk_col += uv_blk_col_step; + + // Middle First + if (ss_x && ss_y) { + neighbors = CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors = CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS; + } + + for (; uv_blk_col < uv_mid_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + vp9_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x, + ss_y, strength, u_accum + uv_blk_col, u_count + 
uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); + } + + if (!use_whole_blk) { + top_weight = blk_fw[1]; + bottom_weight = blk_fw[3]; + } + + // Middle Second + for (; uv_blk_col < uv_last_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + vp9_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x, + ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); + } + + // Right + if (ss_x && ss_y) { + neighbors = CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors = CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS; + } + + vp9_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x, + ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); +} + +void vp9_apply_temporal_filter_sse4_1( + const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, + int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, + int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *const blk_fw, + int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, + uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count) { + const unsigned int chroma_height = block_height >> ss_y, + chroma_width = block_width >> ss_x; + + DECLARE_ALIGNED(16, uint16_t, y_dist[BH * DIST_STRIDE]) = { 0 }; + DECLARE_ALIGNED(16, uint16_t, u_dist[BH * 
DIST_STRIDE]) = { 0 }; + DECLARE_ALIGNED(16, uint16_t, v_dist[BH * DIST_STRIDE]) = { 0 }; + const int *blk_fw_ptr = blk_fw; + + uint16_t *y_dist_ptr = y_dist + 1, *u_dist_ptr = u_dist + 1, + *v_dist_ptr = v_dist + 1; + const uint8_t *y_src_ptr = y_src, *u_src_ptr = u_src, *v_src_ptr = v_src; + const uint8_t *y_pre_ptr = y_pre, *u_pre_ptr = u_pre, *v_pre_ptr = v_pre; + + // Loop variables + unsigned int row, blk_col; + + assert(block_width <= BW && "block width too large"); + assert(block_height <= BH && "block height too large"); + assert(block_width % 16 == 0 && "block width must be multiple of 16"); + assert(block_height % 2 == 0 && "block height must be even"); + assert((ss_x == 0 || ss_x == 1) && (ss_y == 0 || ss_y == 1) && + "invalid chroma subsampling"); + assert(strength >= 0 && strength <= 6 && "invalid temporal filter strength"); + assert(blk_fw[0] >= 0 && "filter weight must be positive"); + assert( + (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) && + "subblock filter weight must be positive"); + assert(blk_fw[0] <= 2 && "subblock filter weight must be less than 2"); + assert( + (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) && + "subblock filter weight must be less than 2"); + + // Precompute the difference squared + for (row = 0; row < block_height; row++) { + for (blk_col = 0; blk_col < block_width; blk_col += 16) { + store_dist_16(y_src_ptr + blk_col, y_pre_ptr + blk_col, + y_dist_ptr + blk_col); + } + y_src_ptr += y_src_stride; + y_pre_ptr += y_pre_stride; + y_dist_ptr += DIST_STRIDE; + } + + for (row = 0; row < chroma_height; row++) { + for (blk_col = 0; blk_col < chroma_width; blk_col += 8) { + store_dist_8(u_src_ptr + blk_col, u_pre_ptr + blk_col, + u_dist_ptr + blk_col); + store_dist_8(v_src_ptr + blk_col, v_pre_ptr + blk_col, + v_dist_ptr + blk_col); + } + + u_src_ptr += uv_src_stride; + u_pre_ptr += uv_pre_stride; + u_dist_ptr += DIST_STRIDE; + v_src_ptr += uv_src_stride; + v_pre_ptr += 
uv_pre_stride; + v_dist_ptr += DIST_STRIDE; + } + + y_dist_ptr = y_dist + 1; + u_dist_ptr = u_dist + 1; + v_dist_ptr = v_dist + 1; + + vp9_apply_temporal_filter_luma(y_pre, y_pre_stride, block_width, block_height, + ss_x, ss_y, strength, blk_fw_ptr, + use_whole_blk, y_accum, y_count, y_dist_ptr, + u_dist_ptr, v_dist_ptr); + + vp9_apply_temporal_filter_chroma( + u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y, + strength, blk_fw_ptr, use_whole_blk, u_accum, u_count, v_accum, v_count, + y_dist_ptr, u_dist_ptr, v_dist_ptr); +} diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c new file mode 100644 index 0000000000..e9943447fd --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c @@ -0,0 +1,1537 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <assert.h> +#include <emmintrin.h> // SSE2 + +#include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/txfm_common.h" +#include "vpx_dsp/x86/bitdepth_conversion_sse2.h" +#include "vpx_dsp/x86/fwd_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" +#include "vpx_ports/mem.h" + +static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in, + int stride) { + const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); + const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); + __m128i mask; + + in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); + in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); + in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); + in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); + + in[0] = _mm_slli_epi16(in[0], 4); + in[1] = _mm_slli_epi16(in[1], 4); + in[2] = _mm_slli_epi16(in[2], 4); + in[3] = _mm_slli_epi16(in[3], 4); + + mask = _mm_cmpeq_epi16(in[0], k__nonzero_bias_a); + in[0] = _mm_add_epi16(in[0], mask); + in[0] = _mm_add_epi16(in[0], k__nonzero_bias_b); +} + +static INLINE void write_buffer_4x4(tran_low_t *output, __m128i *res) { + const __m128i kOne = _mm_set1_epi16(1); + __m128i in01 = _mm_unpacklo_epi64(res[0], res[1]); + __m128i in23 = _mm_unpacklo_epi64(res[2], res[3]); + __m128i out01 = _mm_add_epi16(in01, kOne); + __m128i out23 = _mm_add_epi16(in23, kOne); + out01 = _mm_srai_epi16(out01, 2); + out23 = _mm_srai_epi16(out23, 2); + store_output(&out01, (output + 0 * 8)); + store_output(&out23, (output + 1 * 8)); +} + +static INLINE void transpose_4x4(__m128i *res) { + // Combine and transpose + // 00 01 02 03 20 21 22 23 + // 10 11 12 13 30 31 32 33 + const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); + const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]); + + // 00 10 01 11 02 12 03 13 + // 20 30 21 31 22 32 23 33 + res[0] = _mm_unpacklo_epi32(tr0_0, 
tr0_1); + res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1); + + // 00 10 20 30 01 11 21 31 + // 02 12 22 32 03 13 23 33 + // only use the first 4 16-bit integers + res[1] = _mm_unpackhi_epi64(res[0], res[0]); + res[3] = _mm_unpackhi_epi64(res[2], res[2]); +} + +static void fdct4_sse2(__m128i *in) { + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + + __m128i u[4], v[4]; + u[0] = _mm_unpacklo_epi16(in[0], in[1]); + u[1] = _mm_unpacklo_epi16(in[3], in[2]); + + v[0] = _mm_add_epi16(u[0], u[1]); + v[1] = _mm_sub_epi16(u[0], u[1]); + + u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); // 0 + u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16); // 2 + u[2] = _mm_madd_epi16(v[1], k__cospi_p08_p24); // 1 + u[3] = _mm_madd_epi16(v[1], k__cospi_p24_m08); // 3 + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + + in[0] = _mm_packs_epi32(u[0], u[1]); + in[1] = _mm_packs_epi32(u[2], u[3]); + transpose_4x4(in); +} + +static void fadst4_sse2(__m128i *in) { + const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9); + const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9); + const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9); + const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9); + const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9); + const __m128i 
kZero = _mm_setzero_si128(); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + __m128i u[8], v[8]; + __m128i in7 = _mm_add_epi16(in[0], in[1]); + + u[0] = _mm_unpacklo_epi16(in[0], in[1]); + u[1] = _mm_unpacklo_epi16(in[2], in[3]); + u[2] = _mm_unpacklo_epi16(in7, kZero); + u[3] = _mm_unpacklo_epi16(in[2], kZero); + u[4] = _mm_unpacklo_epi16(in[3], kZero); + + v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02); // s0 + s2 + v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04); // s4 + s5 + v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x1 + v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01); // s1 - s3 + v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02); // -s4 + s6 + v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s4 + v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03); + + u[0] = _mm_add_epi32(v[0], v[1]); + u[1] = _mm_sub_epi32(v[2], v[6]); + u[2] = _mm_add_epi32(v[3], v[4]); + u[3] = _mm_sub_epi32(u[2], u[0]); + u[4] = _mm_slli_epi32(v[5], 2); + u[5] = _mm_sub_epi32(u[4], v[5]); + u[6] = _mm_add_epi32(u[3], u[5]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + + in[0] = _mm_packs_epi32(u[0], u[2]); + in[1] = _mm_packs_epi32(u[1], u[3]); + transpose_4x4(in); +} + +void vp9_fht4x4_sse2(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + __m128i in[4]; + + switch (tx_type) { + case DCT_DCT: vpx_fdct4x4_sse2(input, output, stride); break; + case ADST_DCT: + load_buffer_4x4(input, in, stride); + fadst4_sse2(in); + fdct4_sse2(in); + write_buffer_4x4(output, in); + break; + case DCT_ADST: + load_buffer_4x4(input, in, stride); + fdct4_sse2(in); + fadst4_sse2(in); + write_buffer_4x4(output, 
in); + break; + default: + assert(tx_type == ADST_ADST); + load_buffer_4x4(input, in, stride); + fadst4_sse2(in); + fadst4_sse2(in); + write_buffer_4x4(output, in); + break; + } +} + +// load 8x8 array +static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in, + int stride) { + in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride)); + in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride)); + in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride)); + in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride)); + in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride)); + in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride)); + in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride)); + in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride)); + + in[0] = _mm_slli_epi16(in[0], 2); + in[1] = _mm_slli_epi16(in[1], 2); + in[2] = _mm_slli_epi16(in[2], 2); + in[3] = _mm_slli_epi16(in[3], 2); + in[4] = _mm_slli_epi16(in[4], 2); + in[5] = _mm_slli_epi16(in[5], 2); + in[6] = _mm_slli_epi16(in[6], 2); + in[7] = _mm_slli_epi16(in[7], 2); +} + +// right shift and rounding +static INLINE void right_shift_8x8(__m128i *res, const int bit) { + __m128i sign0 = _mm_srai_epi16(res[0], 15); + __m128i sign1 = _mm_srai_epi16(res[1], 15); + __m128i sign2 = _mm_srai_epi16(res[2], 15); + __m128i sign3 = _mm_srai_epi16(res[3], 15); + __m128i sign4 = _mm_srai_epi16(res[4], 15); + __m128i sign5 = _mm_srai_epi16(res[5], 15); + __m128i sign6 = _mm_srai_epi16(res[6], 15); + __m128i sign7 = _mm_srai_epi16(res[7], 15); + + if (bit == 2) { + const __m128i const_rounding = _mm_set1_epi16(1); + res[0] = _mm_add_epi16(res[0], const_rounding); + res[1] = _mm_add_epi16(res[1], const_rounding); + res[2] = _mm_add_epi16(res[2], const_rounding); + res[3] = _mm_add_epi16(res[3], const_rounding); + res[4] = _mm_add_epi16(res[4], const_rounding); + res[5] = _mm_add_epi16(res[5], const_rounding); + res[6] = _mm_add_epi16(res[6], const_rounding); + res[7] 
= _mm_add_epi16(res[7], const_rounding); + } + + res[0] = _mm_sub_epi16(res[0], sign0); + res[1] = _mm_sub_epi16(res[1], sign1); + res[2] = _mm_sub_epi16(res[2], sign2); + res[3] = _mm_sub_epi16(res[3], sign3); + res[4] = _mm_sub_epi16(res[4], sign4); + res[5] = _mm_sub_epi16(res[5], sign5); + res[6] = _mm_sub_epi16(res[6], sign6); + res[7] = _mm_sub_epi16(res[7], sign7); + + if (bit == 1) { + res[0] = _mm_srai_epi16(res[0], 1); + res[1] = _mm_srai_epi16(res[1], 1); + res[2] = _mm_srai_epi16(res[2], 1); + res[3] = _mm_srai_epi16(res[3], 1); + res[4] = _mm_srai_epi16(res[4], 1); + res[5] = _mm_srai_epi16(res[5], 1); + res[6] = _mm_srai_epi16(res[6], 1); + res[7] = _mm_srai_epi16(res[7], 1); + } else { + res[0] = _mm_srai_epi16(res[0], 2); + res[1] = _mm_srai_epi16(res[1], 2); + res[2] = _mm_srai_epi16(res[2], 2); + res[3] = _mm_srai_epi16(res[3], 2); + res[4] = _mm_srai_epi16(res[4], 2); + res[5] = _mm_srai_epi16(res[5], 2); + res[6] = _mm_srai_epi16(res[6], 2); + res[7] = _mm_srai_epi16(res[7], 2); + } +} + +// write 8x8 array +static INLINE void write_buffer_8x8(tran_low_t *output, __m128i *res, + int stride) { + store_output(&res[0], (output + 0 * stride)); + store_output(&res[1], (output + 1 * stride)); + store_output(&res[2], (output + 2 * stride)); + store_output(&res[3], (output + 3 * stride)); + store_output(&res[4], (output + 4 * stride)); + store_output(&res[5], (output + 5 * stride)); + store_output(&res[6], (output + 6 * stride)); + store_output(&res[7], (output + 7 * stride)); +} + +static void fdct8_sse2(__m128i *in) { + // constants + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); + const __m128i k__cospi_m04_p28 = 
pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + __m128i u0, u1, u2, u3, u4, u5, u6, u7; + __m128i v0, v1, v2, v3, v4, v5, v6, v7; + __m128i s0, s1, s2, s3, s4, s5, s6, s7; + + // stage 1 + s0 = _mm_add_epi16(in[0], in[7]); + s1 = _mm_add_epi16(in[1], in[6]); + s2 = _mm_add_epi16(in[2], in[5]); + s3 = _mm_add_epi16(in[3], in[4]); + s4 = _mm_sub_epi16(in[3], in[4]); + s5 = _mm_sub_epi16(in[2], in[5]); + s6 = _mm_sub_epi16(in[1], in[6]); + s7 = _mm_sub_epi16(in[0], in[7]); + + u0 = _mm_add_epi16(s0, s3); + u1 = _mm_add_epi16(s1, s2); + u2 = _mm_sub_epi16(s1, s2); + u3 = _mm_sub_epi16(s0, s3); + // interleave and perform butterfly multiplication/addition + v0 = _mm_unpacklo_epi16(u0, u1); + v1 = _mm_unpackhi_epi16(u0, u1); + v2 = _mm_unpacklo_epi16(u2, u3); + v3 = _mm_unpackhi_epi16(u2, u3); + + u0 = _mm_madd_epi16(v0, k__cospi_p16_p16); + u1 = _mm_madd_epi16(v1, k__cospi_p16_p16); + u2 = _mm_madd_epi16(v0, k__cospi_p16_m16); + u3 = _mm_madd_epi16(v1, k__cospi_p16_m16); + u4 = _mm_madd_epi16(v2, k__cospi_p24_p08); + u5 = _mm_madd_epi16(v3, k__cospi_p24_p08); + u6 = _mm_madd_epi16(v2, k__cospi_m08_p24); + u7 = _mm_madd_epi16(v3, k__cospi_m08_p24); + + // shift and rounding + v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); + v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); + v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); + v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); + v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); + v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); + + u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + u4 = 
_mm_srai_epi32(v4, DCT_CONST_BITS); + u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); + u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); + u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); + + in[0] = _mm_packs_epi32(u0, u1); + in[2] = _mm_packs_epi32(u4, u5); + in[4] = _mm_packs_epi32(u2, u3); + in[6] = _mm_packs_epi32(u6, u7); + + // stage 2 + // interleave and perform butterfly multiplication/addition + u0 = _mm_unpacklo_epi16(s6, s5); + u1 = _mm_unpackhi_epi16(s6, s5); + v0 = _mm_madd_epi16(u0, k__cospi_p16_m16); + v1 = _mm_madd_epi16(u1, k__cospi_p16_m16); + v2 = _mm_madd_epi16(u0, k__cospi_p16_p16); + v3 = _mm_madd_epi16(u1, k__cospi_p16_p16); + + // shift and rounding + u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); + u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); + u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); + u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); + + v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); + v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); + v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); + v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); + + u0 = _mm_packs_epi32(v0, v1); + u1 = _mm_packs_epi32(v2, v3); + + // stage 3 + s0 = _mm_add_epi16(s4, u0); + s1 = _mm_sub_epi16(s4, u0); + s2 = _mm_sub_epi16(s7, u1); + s3 = _mm_add_epi16(s7, u1); + + // stage 4 + u0 = _mm_unpacklo_epi16(s0, s3); + u1 = _mm_unpackhi_epi16(s0, s3); + u2 = _mm_unpacklo_epi16(s1, s2); + u3 = _mm_unpackhi_epi16(s1, s2); + + v0 = _mm_madd_epi16(u0, k__cospi_p28_p04); + v1 = _mm_madd_epi16(u1, k__cospi_p28_p04); + v2 = _mm_madd_epi16(u2, k__cospi_p12_p20); + v3 = _mm_madd_epi16(u3, k__cospi_p12_p20); + v4 = _mm_madd_epi16(u2, k__cospi_m20_p12); + v5 = _mm_madd_epi16(u3, k__cospi_m20_p12); + v6 = _mm_madd_epi16(u0, k__cospi_m04_p28); + v7 = _mm_madd_epi16(u1, k__cospi_m04_p28); + + // shift and rounding + u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); + u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); + u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); + u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); + u4 = _mm_add_epi32(v4, 
k__DCT_CONST_ROUNDING); + u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); + u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); + u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); + + v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); + v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); + v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); + v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); + v4 = _mm_srai_epi32(u4, DCT_CONST_BITS); + v5 = _mm_srai_epi32(u5, DCT_CONST_BITS); + v6 = _mm_srai_epi32(u6, DCT_CONST_BITS); + v7 = _mm_srai_epi32(u7, DCT_CONST_BITS); + + in[1] = _mm_packs_epi32(v0, v1); + in[3] = _mm_packs_epi32(v4, v5); + in[5] = _mm_packs_epi32(v2, v3); + in[7] = _mm_packs_epi32(v6, v7); + + // transpose + transpose_16bit_8x8(in, in); +} + +static void fadst8_sse2(__m128i *in) { + // Constants + const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); + const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); + const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); + const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); + const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); + const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); + const __m128i k__const_0 = _mm_setzero_si128(); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + + __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15; + __m128i v0, v1, v2, v3, v4, v5, 
v6, v7, v8, v9, v10, v11, v12, v13, v14, v15; + __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15; + __m128i s0, s1, s2, s3, s4, s5, s6, s7; + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + + // properly aligned for butterfly input + in0 = in[7]; + in1 = in[0]; + in2 = in[5]; + in3 = in[2]; + in4 = in[3]; + in5 = in[4]; + in6 = in[1]; + in7 = in[6]; + + // column transformation + // stage 1 + // interleave and multiply/add into 32-bit integer + s0 = _mm_unpacklo_epi16(in0, in1); + s1 = _mm_unpackhi_epi16(in0, in1); + s2 = _mm_unpacklo_epi16(in2, in3); + s3 = _mm_unpackhi_epi16(in2, in3); + s4 = _mm_unpacklo_epi16(in4, in5); + s5 = _mm_unpackhi_epi16(in4, in5); + s6 = _mm_unpacklo_epi16(in6, in7); + s7 = _mm_unpackhi_epi16(in6, in7); + + u0 = _mm_madd_epi16(s0, k__cospi_p02_p30); + u1 = _mm_madd_epi16(s1, k__cospi_p02_p30); + u2 = _mm_madd_epi16(s0, k__cospi_p30_m02); + u3 = _mm_madd_epi16(s1, k__cospi_p30_m02); + u4 = _mm_madd_epi16(s2, k__cospi_p10_p22); + u5 = _mm_madd_epi16(s3, k__cospi_p10_p22); + u6 = _mm_madd_epi16(s2, k__cospi_p22_m10); + u7 = _mm_madd_epi16(s3, k__cospi_p22_m10); + u8 = _mm_madd_epi16(s4, k__cospi_p18_p14); + u9 = _mm_madd_epi16(s5, k__cospi_p18_p14); + u10 = _mm_madd_epi16(s4, k__cospi_p14_m18); + u11 = _mm_madd_epi16(s5, k__cospi_p14_m18); + u12 = _mm_madd_epi16(s6, k__cospi_p26_p06); + u13 = _mm_madd_epi16(s7, k__cospi_p26_p06); + u14 = _mm_madd_epi16(s6, k__cospi_p06_m26); + u15 = _mm_madd_epi16(s7, k__cospi_p06_m26); + + // addition + w0 = _mm_add_epi32(u0, u8); + w1 = _mm_add_epi32(u1, u9); + w2 = _mm_add_epi32(u2, u10); + w3 = _mm_add_epi32(u3, u11); + w4 = _mm_add_epi32(u4, u12); + w5 = _mm_add_epi32(u5, u13); + w6 = _mm_add_epi32(u6, u14); + w7 = _mm_add_epi32(u7, u15); + w8 = _mm_sub_epi32(u0, u8); + w9 = _mm_sub_epi32(u1, u9); + w10 = _mm_sub_epi32(u2, u10); + w11 = _mm_sub_epi32(u3, u11); + w12 = _mm_sub_epi32(u4, u12); + w13 = _mm_sub_epi32(u5, u13); + w14 = _mm_sub_epi32(u6, u14); + w15 = 
_mm_sub_epi32(u7, u15); + + // shift and rounding + v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); + v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); + v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); + v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); + v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); + v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); + v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); + v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); + v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING); + v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING); + v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING); + v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING); + v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING); + v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING); + v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING); + v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING); + + u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); + u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); + u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); + u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); + u8 = _mm_srai_epi32(v8, DCT_CONST_BITS); + u9 = _mm_srai_epi32(v9, DCT_CONST_BITS); + u10 = _mm_srai_epi32(v10, DCT_CONST_BITS); + u11 = _mm_srai_epi32(v11, DCT_CONST_BITS); + u12 = _mm_srai_epi32(v12, DCT_CONST_BITS); + u13 = _mm_srai_epi32(v13, DCT_CONST_BITS); + u14 = _mm_srai_epi32(v14, DCT_CONST_BITS); + u15 = _mm_srai_epi32(v15, DCT_CONST_BITS); + + // back to 16-bit and pack 8 integers into __m128i + in[0] = _mm_packs_epi32(u0, u1); + in[1] = _mm_packs_epi32(u2, u3); + in[2] = _mm_packs_epi32(u4, u5); + in[3] = _mm_packs_epi32(u6, u7); + in[4] = _mm_packs_epi32(u8, u9); + in[5] = _mm_packs_epi32(u10, u11); + in[6] = _mm_packs_epi32(u12, u13); + in[7] = _mm_packs_epi32(u14, u15); + + // stage 2 + s0 = _mm_add_epi16(in[0], in[2]); + s1 = _mm_add_epi16(in[1], in[3]); + s2 = 
_mm_sub_epi16(in[0], in[2]); + s3 = _mm_sub_epi16(in[1], in[3]); + u0 = _mm_unpacklo_epi16(in[4], in[5]); + u1 = _mm_unpackhi_epi16(in[4], in[5]); + u2 = _mm_unpacklo_epi16(in[6], in[7]); + u3 = _mm_unpackhi_epi16(in[6], in[7]); + + v0 = _mm_madd_epi16(u0, k__cospi_p08_p24); + v1 = _mm_madd_epi16(u1, k__cospi_p08_p24); + v2 = _mm_madd_epi16(u0, k__cospi_p24_m08); + v3 = _mm_madd_epi16(u1, k__cospi_p24_m08); + v4 = _mm_madd_epi16(u2, k__cospi_m24_p08); + v5 = _mm_madd_epi16(u3, k__cospi_m24_p08); + v6 = _mm_madd_epi16(u2, k__cospi_p08_p24); + v7 = _mm_madd_epi16(u3, k__cospi_p08_p24); + + w0 = _mm_add_epi32(v0, v4); + w1 = _mm_add_epi32(v1, v5); + w2 = _mm_add_epi32(v2, v6); + w3 = _mm_add_epi32(v3, v7); + w4 = _mm_sub_epi32(v0, v4); + w5 = _mm_sub_epi32(v1, v5); + w6 = _mm_sub_epi32(v2, v6); + w7 = _mm_sub_epi32(v3, v7); + + v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); + v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); + v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); + v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); + v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); + v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); + v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); + v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); + + u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); + u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); + u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); + u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); + + // back to 16-bit intergers + s4 = _mm_packs_epi32(u0, u1); + s5 = _mm_packs_epi32(u2, u3); + s6 = _mm_packs_epi32(u4, u5); + s7 = _mm_packs_epi32(u6, u7); + + // stage 3 + u0 = _mm_unpacklo_epi16(s2, s3); + u1 = _mm_unpackhi_epi16(s2, s3); + u2 = _mm_unpacklo_epi16(s6, s7); + u3 = _mm_unpackhi_epi16(s6, s7); + + v0 = _mm_madd_epi16(u0, k__cospi_p16_p16); + v1 = _mm_madd_epi16(u1, k__cospi_p16_p16); + v2 = 
_mm_madd_epi16(u0, k__cospi_p16_m16); + v3 = _mm_madd_epi16(u1, k__cospi_p16_m16); + v4 = _mm_madd_epi16(u2, k__cospi_p16_p16); + v5 = _mm_madd_epi16(u3, k__cospi_p16_p16); + v6 = _mm_madd_epi16(u2, k__cospi_p16_m16); + v7 = _mm_madd_epi16(u3, k__cospi_p16_m16); + + u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); + u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); + u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); + u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); + u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); + u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); + u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); + u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); + + v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); + v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); + v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); + v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); + v4 = _mm_srai_epi32(u4, DCT_CONST_BITS); + v5 = _mm_srai_epi32(u5, DCT_CONST_BITS); + v6 = _mm_srai_epi32(u6, DCT_CONST_BITS); + v7 = _mm_srai_epi32(u7, DCT_CONST_BITS); + + s2 = _mm_packs_epi32(v0, v1); + s3 = _mm_packs_epi32(v2, v3); + s6 = _mm_packs_epi32(v4, v5); + s7 = _mm_packs_epi32(v6, v7); + + // FIXME(jingning): do subtract using bit inversion? 
+ in[0] = s0; + in[1] = _mm_sub_epi16(k__const_0, s4); + in[2] = s6; + in[3] = _mm_sub_epi16(k__const_0, s2); + in[4] = s3; + in[5] = _mm_sub_epi16(k__const_0, s7); + in[6] = s5; + in[7] = _mm_sub_epi16(k__const_0, s1); + + // transpose + transpose_16bit_8x8(in, in); +} + +void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + __m128i in[8]; + + switch (tx_type) { + case DCT_DCT: vpx_fdct8x8_sse2(input, output, stride); break; + case ADST_DCT: + load_buffer_8x8(input, in, stride); + fadst8_sse2(in); + fdct8_sse2(in); + right_shift_8x8(in, 1); + write_buffer_8x8(output, in, 8); + break; + case DCT_ADST: + load_buffer_8x8(input, in, stride); + fdct8_sse2(in); + fadst8_sse2(in); + right_shift_8x8(in, 1); + write_buffer_8x8(output, in, 8); + break; + default: + assert(tx_type == ADST_ADST); + load_buffer_8x8(input, in, stride); + fadst8_sse2(in); + fadst8_sse2(in); + right_shift_8x8(in, 1); + write_buffer_8x8(output, in, 8); + break; + } +} + +static INLINE void load_buffer_16x16(const int16_t *input, __m128i *in0, + __m128i *in1, int stride) { + // load first 8 columns + load_buffer_8x8(input, in0, stride); + load_buffer_8x8(input + 8 * stride, in0 + 8, stride); + + input += 8; + // load second 8 columns + load_buffer_8x8(input, in1, stride); + load_buffer_8x8(input + 8 * stride, in1 + 8, stride); +} + +static INLINE void write_buffer_16x16(tran_low_t *output, __m128i *in0, + __m128i *in1, int stride) { + // write first 8 columns + write_buffer_8x8(output, in0, stride); + write_buffer_8x8(output + 8 * stride, in0 + 8, stride); + // write second 8 columns + output += 8; + write_buffer_8x8(output, in1, stride); + write_buffer_8x8(output + 8 * stride, in1 + 8, stride); +} + +static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) { + // perform rounding operations + right_shift_8x8(res0, 2); + right_shift_8x8(res0 + 8, 2); + right_shift_8x8(res1, 2); + right_shift_8x8(res1 + 8, 2); +} + +static void fdct16_8col(__m128i 
*in) { + // perform 16x16 1-D DCT for 8 columns + __m128i i[8], s[8], p[8], t[8], u[16], v[16]; + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); + const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64); + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); + const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64); + const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64); + const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); + const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); + const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64); + const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64); + const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64); + const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + + // stage 1 + i[0] = _mm_add_epi16(in[0], in[15]); + i[1] = _mm_add_epi16(in[1], in[14]); + i[2] = _mm_add_epi16(in[2], in[13]); + i[3] = _mm_add_epi16(in[3], in[12]); + i[4] = _mm_add_epi16(in[4], in[11]); + i[5] = _mm_add_epi16(in[5], in[10]); + i[6] = _mm_add_epi16(in[6], in[9]); + i[7] = _mm_add_epi16(in[7], in[8]); + + s[0] = _mm_sub_epi16(in[7], in[8]); + s[1] = _mm_sub_epi16(in[6], in[9]); + s[2] = _mm_sub_epi16(in[5], in[10]); + s[3] = 
_mm_sub_epi16(in[4], in[11]); + s[4] = _mm_sub_epi16(in[3], in[12]); + s[5] = _mm_sub_epi16(in[2], in[13]); + s[6] = _mm_sub_epi16(in[1], in[14]); + s[7] = _mm_sub_epi16(in[0], in[15]); + + p[0] = _mm_add_epi16(i[0], i[7]); + p[1] = _mm_add_epi16(i[1], i[6]); + p[2] = _mm_add_epi16(i[2], i[5]); + p[3] = _mm_add_epi16(i[3], i[4]); + p[4] = _mm_sub_epi16(i[3], i[4]); + p[5] = _mm_sub_epi16(i[2], i[5]); + p[6] = _mm_sub_epi16(i[1], i[6]); + p[7] = _mm_sub_epi16(i[0], i[7]); + + u[0] = _mm_add_epi16(p[0], p[3]); + u[1] = _mm_add_epi16(p[1], p[2]); + u[2] = _mm_sub_epi16(p[1], p[2]); + u[3] = _mm_sub_epi16(p[0], p[3]); + + v[0] = _mm_unpacklo_epi16(u[0], u[1]); + v[1] = _mm_unpackhi_epi16(u[0], u[1]); + v[2] = _mm_unpacklo_epi16(u[2], u[3]); + v[3] = _mm_unpackhi_epi16(u[2], u[3]); + + u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); + u[1] = _mm_madd_epi16(v[1], k__cospi_p16_p16); + u[2] = _mm_madd_epi16(v[0], k__cospi_p16_m16); + u[3] = _mm_madd_epi16(v[1], k__cospi_p16_m16); + u[4] = _mm_madd_epi16(v[2], k__cospi_p24_p08); + u[5] = _mm_madd_epi16(v[3], k__cospi_p24_p08); + u[6] = _mm_madd_epi16(v[2], k__cospi_m08_p24); + u[7] = _mm_madd_epi16(v[3], k__cospi_m08_p24); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + + in[0] = 
_mm_packs_epi32(u[0], u[1]); + in[4] = _mm_packs_epi32(u[4], u[5]); + in[8] = _mm_packs_epi32(u[2], u[3]); + in[12] = _mm_packs_epi32(u[6], u[7]); + + u[0] = _mm_unpacklo_epi16(p[5], p[6]); + u[1] = _mm_unpackhi_epi16(p[5], p[6]); + v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); + v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); + v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); + v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + + u[0] = _mm_packs_epi32(v[0], v[1]); + u[1] = _mm_packs_epi32(v[2], v[3]); + + t[0] = _mm_add_epi16(p[4], u[0]); + t[1] = _mm_sub_epi16(p[4], u[0]); + t[2] = _mm_sub_epi16(p[7], u[1]); + t[3] = _mm_add_epi16(p[7], u[1]); + + u[0] = _mm_unpacklo_epi16(t[0], t[3]); + u[1] = _mm_unpackhi_epi16(t[0], t[3]); + u[2] = _mm_unpacklo_epi16(t[1], t[2]); + u[3] = _mm_unpackhi_epi16(t[1], t[2]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p28_p04); + v[1] = _mm_madd_epi16(u[1], k__cospi_p28_p04); + v[2] = _mm_madd_epi16(u[2], k__cospi_p12_p20); + v[3] = _mm_madd_epi16(u[3], k__cospi_p12_p20); + v[4] = _mm_madd_epi16(u[2], k__cospi_m20_p12); + v[5] = _mm_madd_epi16(u[3], k__cospi_m20_p12); + v[6] = _mm_madd_epi16(u[0], k__cospi_m04_p28); + v[7] = _mm_madd_epi16(u[1], k__cospi_m04_p28); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = 
_mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + + in[2] = _mm_packs_epi32(v[0], v[1]); + in[6] = _mm_packs_epi32(v[4], v[5]); + in[10] = _mm_packs_epi32(v[2], v[3]); + in[14] = _mm_packs_epi32(v[6], v[7]); + + // stage 2 + u[0] = _mm_unpacklo_epi16(s[2], s[5]); + u[1] = _mm_unpackhi_epi16(s[2], s[5]); + u[2] = _mm_unpacklo_epi16(s[3], s[4]); + u[3] = _mm_unpackhi_epi16(s[3], s[4]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); + v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); + v[2] = _mm_madd_epi16(u[2], k__cospi_m16_p16); + v[3] = _mm_madd_epi16(u[3], k__cospi_m16_p16); + v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); + v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16); + v[6] = _mm_madd_epi16(u[0], k__cospi_p16_p16); + v[7] = _mm_madd_epi16(u[1], k__cospi_p16_p16); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + + t[2] = _mm_packs_epi32(v[0], v[1]); 
+ t[3] = _mm_packs_epi32(v[2], v[3]); + t[4] = _mm_packs_epi32(v[4], v[5]); + t[5] = _mm_packs_epi32(v[6], v[7]); + + // stage 3 + p[0] = _mm_add_epi16(s[0], t[3]); + p[1] = _mm_add_epi16(s[1], t[2]); + p[2] = _mm_sub_epi16(s[1], t[2]); + p[3] = _mm_sub_epi16(s[0], t[3]); + p[4] = _mm_sub_epi16(s[7], t[4]); + p[5] = _mm_sub_epi16(s[6], t[5]); + p[6] = _mm_add_epi16(s[6], t[5]); + p[7] = _mm_add_epi16(s[7], t[4]); + + // stage 4 + u[0] = _mm_unpacklo_epi16(p[1], p[6]); + u[1] = _mm_unpackhi_epi16(p[1], p[6]); + u[2] = _mm_unpacklo_epi16(p[2], p[5]); + u[3] = _mm_unpackhi_epi16(p[2], p[5]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24); + v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24); + v[2] = _mm_madd_epi16(u[2], k__cospi_p24_p08); + v[3] = _mm_madd_epi16(u[3], k__cospi_p24_p08); + v[4] = _mm_madd_epi16(u[2], k__cospi_p08_m24); + v[5] = _mm_madd_epi16(u[3], k__cospi_p08_m24); + v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08); + v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + + t[1] = _mm_packs_epi32(v[0], v[1]); + t[2] = _mm_packs_epi32(v[2], v[3]); + t[5] = _mm_packs_epi32(v[4], v[5]); + t[6] = _mm_packs_epi32(v[6], v[7]); + + // stage 5 + s[0] = _mm_add_epi16(p[0], t[1]); 
+ s[1] = _mm_sub_epi16(p[0], t[1]); + s[2] = _mm_add_epi16(p[3], t[2]); + s[3] = _mm_sub_epi16(p[3], t[2]); + s[4] = _mm_sub_epi16(p[4], t[5]); + s[5] = _mm_add_epi16(p[4], t[5]); + s[6] = _mm_sub_epi16(p[7], t[6]); + s[7] = _mm_add_epi16(p[7], t[6]); + + // stage 6 + u[0] = _mm_unpacklo_epi16(s[0], s[7]); + u[1] = _mm_unpackhi_epi16(s[0], s[7]); + u[2] = _mm_unpacklo_epi16(s[1], s[6]); + u[3] = _mm_unpackhi_epi16(s[1], s[6]); + u[4] = _mm_unpacklo_epi16(s[2], s[5]); + u[5] = _mm_unpackhi_epi16(s[2], s[5]); + u[6] = _mm_unpacklo_epi16(s[3], s[4]); + u[7] = _mm_unpackhi_epi16(s[3], s[4]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p30_p02); + v[1] = _mm_madd_epi16(u[1], k__cospi_p30_p02); + v[2] = _mm_madd_epi16(u[2], k__cospi_p14_p18); + v[3] = _mm_madd_epi16(u[3], k__cospi_p14_p18); + v[4] = _mm_madd_epi16(u[4], k__cospi_p22_p10); + v[5] = _mm_madd_epi16(u[5], k__cospi_p22_p10); + v[6] = _mm_madd_epi16(u[6], k__cospi_p06_p26); + v[7] = _mm_madd_epi16(u[7], k__cospi_p06_p26); + v[8] = _mm_madd_epi16(u[6], k__cospi_m26_p06); + v[9] = _mm_madd_epi16(u[7], k__cospi_m26_p06); + v[10] = _mm_madd_epi16(u[4], k__cospi_m10_p22); + v[11] = _mm_madd_epi16(u[5], k__cospi_m10_p22); + v[12] = _mm_madd_epi16(u[2], k__cospi_m18_p14); + v[13] = _mm_madd_epi16(u[3], k__cospi_m18_p14); + v[14] = _mm_madd_epi16(u[0], k__cospi_m02_p30); + v[15] = _mm_madd_epi16(u[1], k__cospi_m02_p30); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); + u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); + u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); + u[11] = 
_mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); + u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); + u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); + u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); + u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); + + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); + v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); + v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); + v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); + v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); + v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); + v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); + v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + + in[1] = _mm_packs_epi32(v[0], v[1]); + in[9] = _mm_packs_epi32(v[2], v[3]); + in[5] = _mm_packs_epi32(v[4], v[5]); + in[13] = _mm_packs_epi32(v[6], v[7]); + in[3] = _mm_packs_epi32(v[8], v[9]); + in[11] = _mm_packs_epi32(v[10], v[11]); + in[7] = _mm_packs_epi32(v[12], v[13]); + in[15] = _mm_packs_epi32(v[14], v[15]); +} + +/* fadst16_8col: in-place 1-D 16-point ADST over in[0..15]; each __m128i is handled as eight signed 16-bit lanes. Every rotation stage widens to 32 bits with _mm_madd_epi16, rounds (add DCT_CONST_ROUNDING, arithmetic shift right by DCT_CONST_BITS) and narrows back with signed saturation (_mm_packs_epi32). */ +static void fadst16_8col(__m128i *in) { + // perform 16x16 1-D ADST for 8 columns + __m128i s[16], x[16], u[32], v[32]; + const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); + const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64); + const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64); + const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64); + const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64); + const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64); + const __m128i k__cospi_p13_p19 = 
pair_set_epi16(cospi_13_64, cospi_19_64); + const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64); + const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64); + const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64); + const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64); + const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64); + const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64); + const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64); + const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64); + const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64); + const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); + const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); + const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64); + const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64); + const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); + const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i kZero = _mm_setzero_si128(); + + /* stage 1: interleave the row pairs (15,0), (13,2), ..., (1,14) so that each _mm_madd_epi16 below produces one two-term weighted sum per 32-bit lane */ + u[0] = _mm_unpacklo_epi16(in[15], in[0]); + u[1] = _mm_unpackhi_epi16(in[15], in[0]); + u[2] = _mm_unpacklo_epi16(in[13], in[2]); + u[3] = 
_mm_unpackhi_epi16(in[13], in[2]); + u[4] = _mm_unpacklo_epi16(in[11], in[4]); + u[5] = _mm_unpackhi_epi16(in[11], in[4]); + u[6] = _mm_unpacklo_epi16(in[9], in[6]); + u[7] = _mm_unpackhi_epi16(in[9], in[6]); + u[8] = _mm_unpacklo_epi16(in[7], in[8]); + u[9] = _mm_unpackhi_epi16(in[7], in[8]); + u[10] = _mm_unpacklo_epi16(in[5], in[10]); + u[11] = _mm_unpackhi_epi16(in[5], in[10]); + u[12] = _mm_unpacklo_epi16(in[3], in[12]); + u[13] = _mm_unpackhi_epi16(in[3], in[12]); + u[14] = _mm_unpacklo_epi16(in[1], in[14]); + u[15] = _mm_unpackhi_epi16(in[1], in[14]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31); + v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31); + v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01); + v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01); + v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27); + v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27); + v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05); + v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05); + v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23); + v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23); + v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09); + v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09); + v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19); + v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19); + v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13); + v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13); + v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15); + v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15); + v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17); + v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17); + v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11); + v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11); + v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21); + v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21); + v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07); + v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07); + v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25); + v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25); + v[28] = _mm_madd_epi16(u[14], 
k__cospi_p29_p03); + v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03); + v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29); + v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29); + + /* 32-bit butterfly: u[0..15] = v[i] + v[i+16], u[16..31] = v[i] - v[i+16] */ + u[0] = _mm_add_epi32(v[0], v[16]); + u[1] = _mm_add_epi32(v[1], v[17]); + u[2] = _mm_add_epi32(v[2], v[18]); + u[3] = _mm_add_epi32(v[3], v[19]); + u[4] = _mm_add_epi32(v[4], v[20]); + u[5] = _mm_add_epi32(v[5], v[21]); + u[6] = _mm_add_epi32(v[6], v[22]); + u[7] = _mm_add_epi32(v[7], v[23]); + u[8] = _mm_add_epi32(v[8], v[24]); + u[9] = _mm_add_epi32(v[9], v[25]); + u[10] = _mm_add_epi32(v[10], v[26]); + u[11] = _mm_add_epi32(v[11], v[27]); + u[12] = _mm_add_epi32(v[12], v[28]); + u[13] = _mm_add_epi32(v[13], v[29]); + u[14] = _mm_add_epi32(v[14], v[30]); + u[15] = _mm_add_epi32(v[15], v[31]); + u[16] = _mm_sub_epi32(v[0], v[16]); + u[17] = _mm_sub_epi32(v[1], v[17]); + u[18] = _mm_sub_epi32(v[2], v[18]); + u[19] = _mm_sub_epi32(v[3], v[19]); + u[20] = _mm_sub_epi32(v[4], v[20]); + u[21] = _mm_sub_epi32(v[5], v[21]); + u[22] = _mm_sub_epi32(v[6], v[22]); + u[23] = _mm_sub_epi32(v[7], v[23]); + u[24] = _mm_sub_epi32(v[8], v[24]); + u[25] = _mm_sub_epi32(v[9], v[25]); + u[26] = _mm_sub_epi32(v[10], v[26]); + u[27] = _mm_sub_epi32(v[11], v[27]); + u[28] = _mm_sub_epi32(v[12], v[28]); + u[29] = _mm_sub_epi32(v[13], v[29]); + u[30] = _mm_sub_epi32(v[14], v[30]); + u[31] = _mm_sub_epi32(v[15], v[31]); + + /* add the rounding constant; the arithmetic shift by DCT_CONST_BITS follows */ + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); + v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = 
_mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING); + v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING); + v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING); + v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING); + v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING); + v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING); + v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING); + v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING); + v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING); + v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING); + v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING); + v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING); + v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING); + v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING); + v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING); + v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); + u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS); + u[17] = _mm_srai_epi32(v[17], 
DCT_CONST_BITS); + u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS); + u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS); + u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS); + u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS); + u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS); + u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS); + u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS); + u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS); + u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS); + u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS); + u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS); + u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS); + u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS); + u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS); + + /* narrow to 16 bits with signed saturation; the lo/hi halves split by the unpack step are rejoined here */ + s[0] = _mm_packs_epi32(u[0], u[1]); + s[1] = _mm_packs_epi32(u[2], u[3]); + s[2] = _mm_packs_epi32(u[4], u[5]); + s[3] = _mm_packs_epi32(u[6], u[7]); + s[4] = _mm_packs_epi32(u[8], u[9]); + s[5] = _mm_packs_epi32(u[10], u[11]); + s[6] = _mm_packs_epi32(u[12], u[13]); + s[7] = _mm_packs_epi32(u[14], u[15]); + s[8] = _mm_packs_epi32(u[16], u[17]); + s[9] = _mm_packs_epi32(u[18], u[19]); + s[10] = _mm_packs_epi32(u[20], u[21]); + s[11] = _mm_packs_epi32(u[22], u[23]); + s[12] = _mm_packs_epi32(u[24], u[25]); + s[13] = _mm_packs_epi32(u[26], u[27]); + s[14] = _mm_packs_epi32(u[28], u[29]); + s[15] = _mm_packs_epi32(u[30], u[31]); + + // stage 2 + u[0] = _mm_unpacklo_epi16(s[8], s[9]); + u[1] = _mm_unpackhi_epi16(s[8], s[9]); + u[2] = _mm_unpacklo_epi16(s[10], s[11]); + u[3] = _mm_unpackhi_epi16(s[10], s[11]); + u[4] = _mm_unpacklo_epi16(s[12], s[13]); + u[5] = _mm_unpackhi_epi16(s[12], s[13]); + u[6] = _mm_unpacklo_epi16(s[14], s[15]); + u[7] = _mm_unpackhi_epi16(s[14], s[15]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28); + v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28); + v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04); + v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04); + v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12); + v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12); + v[6] 
= _mm_madd_epi16(u[2], k__cospi_p12_m20); + v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20); + v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04); + v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04); + v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28); + v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28); + v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20); + v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20); + v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12); + v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12); + + u[0] = _mm_add_epi32(v[0], v[8]); + u[1] = _mm_add_epi32(v[1], v[9]); + u[2] = _mm_add_epi32(v[2], v[10]); + u[3] = _mm_add_epi32(v[3], v[11]); + u[4] = _mm_add_epi32(v[4], v[12]); + u[5] = _mm_add_epi32(v[5], v[13]); + u[6] = _mm_add_epi32(v[6], v[14]); + u[7] = _mm_add_epi32(v[7], v[15]); + u[8] = _mm_sub_epi32(v[0], v[8]); + u[9] = _mm_sub_epi32(v[1], v[9]); + u[10] = _mm_sub_epi32(v[2], v[10]); + u[11] = _mm_sub_epi32(v[3], v[11]); + u[12] = _mm_sub_epi32(v[4], v[12]); + u[13] = _mm_sub_epi32(v[5], v[13]); + u[14] = _mm_sub_epi32(v[6], v[14]); + u[15] = _mm_sub_epi32(v[7], v[15]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); + v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + 
u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); + + /* x[0..7]: plain 16-bit add/sub of s[0..7]; x[8..15]: the rotated terms computed above, narrowed to 16 bits */ + x[0] = _mm_add_epi16(s[0], s[4]); + x[1] = _mm_add_epi16(s[1], s[5]); + x[2] = _mm_add_epi16(s[2], s[6]); + x[3] = _mm_add_epi16(s[3], s[7]); + x[4] = _mm_sub_epi16(s[0], s[4]); + x[5] = _mm_sub_epi16(s[1], s[5]); + x[6] = _mm_sub_epi16(s[2], s[6]); + x[7] = _mm_sub_epi16(s[3], s[7]); + x[8] = _mm_packs_epi32(u[0], u[1]); + x[9] = _mm_packs_epi32(u[2], u[3]); + x[10] = _mm_packs_epi32(u[4], u[5]); + x[11] = _mm_packs_epi32(u[6], u[7]); + x[12] = _mm_packs_epi32(u[8], u[9]); + x[13] = _mm_packs_epi32(u[10], u[11]); + x[14] = _mm_packs_epi32(u[12], u[13]); + x[15] = _mm_packs_epi32(u[14], u[15]); + + // stage 3 + u[0] = _mm_unpacklo_epi16(x[4], x[5]); + u[1] = _mm_unpackhi_epi16(x[4], x[5]); + u[2] = _mm_unpacklo_epi16(x[6], x[7]); + u[3] = _mm_unpackhi_epi16(x[6], x[7]); + u[4] = _mm_unpacklo_epi16(x[12], x[13]); + u[5] = _mm_unpackhi_epi16(x[12], x[13]); + u[6] = _mm_unpacklo_epi16(x[14], x[15]); + u[7] = _mm_unpackhi_epi16(x[14], x[15]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24); + v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24); + v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08); + v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08); + v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08); + v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08); + v[6] = _mm_madd_epi16(u[2], 
k__cospi_p08_p24); + v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); + v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24); + v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24); + v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08); + v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08); + v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08); + v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08); + v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24); + v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24); + + u[0] = _mm_add_epi32(v[0], v[4]); + u[1] = _mm_add_epi32(v[1], v[5]); + u[2] = _mm_add_epi32(v[2], v[6]); + u[3] = _mm_add_epi32(v[3], v[7]); + u[4] = _mm_sub_epi32(v[0], v[4]); + u[5] = _mm_sub_epi32(v[1], v[5]); + u[6] = _mm_sub_epi32(v[2], v[6]); + u[7] = _mm_sub_epi32(v[3], v[7]); + u[8] = _mm_add_epi32(v[8], v[12]); + u[9] = _mm_add_epi32(v[9], v[13]); + u[10] = _mm_add_epi32(v[10], v[14]); + u[11] = _mm_add_epi32(v[11], v[15]); + u[12] = _mm_sub_epi32(v[8], v[12]); + u[13] = _mm_sub_epi32(v[9], v[13]); + u[14] = _mm_sub_epi32(v[10], v[14]); + u[15] = _mm_sub_epi32(v[11], v[15]); + + u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); + u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + v[1] = 
_mm_srai_epi32(u[1], DCT_CONST_BITS); + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); + v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); + v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); + v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); + v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); + v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); + v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); + v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + + /* s[0..3] and s[8..11]: 16-bit add/sub of x[]; s[4..7] and s[12..15]: the rotations above, narrowed to 16 bits */ + s[0] = _mm_add_epi16(x[0], x[2]); + s[1] = _mm_add_epi16(x[1], x[3]); + s[2] = _mm_sub_epi16(x[0], x[2]); + s[3] = _mm_sub_epi16(x[1], x[3]); + s[4] = _mm_packs_epi32(v[0], v[1]); + s[5] = _mm_packs_epi32(v[2], v[3]); + s[6] = _mm_packs_epi32(v[4], v[5]); + s[7] = _mm_packs_epi32(v[6], v[7]); + s[8] = _mm_add_epi16(x[8], x[10]); + s[9] = _mm_add_epi16(x[9], x[11]); + s[10] = _mm_sub_epi16(x[8], x[10]); + s[11] = _mm_sub_epi16(x[9], x[11]); + s[12] = _mm_packs_epi32(v[8], v[9]); + s[13] = _mm_packs_epi32(v[10], v[11]); + s[14] = _mm_packs_epi32(v[12], v[13]); + s[15] = _mm_packs_epi32(v[14], v[15]); + + // stage 4 + u[0] = _mm_unpacklo_epi16(s[2], s[3]); + u[1] = _mm_unpackhi_epi16(s[2], s[3]); + u[2] = _mm_unpacklo_epi16(s[6], s[7]); + u[3] = _mm_unpackhi_epi16(s[6], s[7]); + u[4] = _mm_unpacklo_epi16(s[10], s[11]); + u[5] = _mm_unpackhi_epi16(s[10], s[11]); + u[6] = _mm_unpacklo_epi16(s[14], s[15]); + u[7] = _mm_unpackhi_epi16(s[14], s[15]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16); + v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16); + v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); + v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); + v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); + v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16); + v[6] = _mm_madd_epi16(u[2], 
k__cospi_m16_p16); + v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16); + v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16); + v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16); + v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16); + v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16); + v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16); + v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16); + v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16); + v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); + u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); + u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); + u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); + u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); + u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); + u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); + u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); + + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); + v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); + v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); + v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); + v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); + v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); + 
v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); + v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + + /* write the results back in place; outputs 1, 3, 13 and 15 are negated (kZero - s[k]) */ + in[0] = s[0]; + in[1] = _mm_sub_epi16(kZero, s[8]); + in[2] = s[12]; + in[3] = _mm_sub_epi16(kZero, s[4]); + in[4] = _mm_packs_epi32(v[4], v[5]); + in[5] = _mm_packs_epi32(v[12], v[13]); + in[6] = _mm_packs_epi32(v[8], v[9]); + in[7] = _mm_packs_epi32(v[0], v[1]); + in[8] = _mm_packs_epi32(v[2], v[3]); + in[9] = _mm_packs_epi32(v[10], v[11]); + in[10] = _mm_packs_epi32(v[14], v[15]); + in[11] = _mm_packs_epi32(v[6], v[7]); + in[12] = s[5]; + in[13] = _mm_sub_epi16(kZero, s[13]); + in[14] = s[9]; + in[15] = _mm_sub_epi16(kZero, s[1]); +} + +/* fdct16_sse2: runs the 8-column 1-D DCT on each half (in0, in1) and then transposes the 16x16 block in place, so a second call transforms the other dimension. */ +static void fdct16_sse2(__m128i *in0, __m128i *in1) { + fdct16_8col(in0); + fdct16_8col(in1); + transpose_16bit_16x16(in0, in1); +} + +/* fadst16_sse2: same structure as fdct16_sse2, using the 8-column 1-D ADST. */ +static void fadst16_sse2(__m128i *in0, __m128i *in1) { + fadst16_8col(in0); + fadst16_8col(in1); + transpose_16bit_16x16(in0, in1); +} + +/* vp9_fht16x16_sse2: forward 16x16 hybrid transform selected by tx_type. DCT_DCT is delegated to vpx_fdct16x16_sse2(input, output, stride). Every other type loads the block into in0/in1 (16 __m128i each), runs the first 1-D pass, calls right_shift_16x16 between the passes (helper not visible here - presumably intermediate rounding, confirm), runs the second 1-D pass and writes the result with an output stride of 16. ADST_DCT = fadst then fdct, DCT_ADST = fdct then fadst; the default branch asserts tx_type == ADST_ADST and runs fadst twice. */ +void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + __m128i in0[16], in1[16]; + + switch (tx_type) { + case DCT_DCT: vpx_fdct16x16_sse2(input, output, stride); break; + case ADST_DCT: + load_buffer_16x16(input, in0, in1, stride); + fadst16_sse2(in0, in1); + right_shift_16x16(in0, in1); + fdct16_sse2(in0, in1); + write_buffer_16x16(output, in0, in1, 16); + break; + case DCT_ADST: + load_buffer_16x16(input, in0, in1, stride); + fdct16_sse2(in0, in1); + right_shift_16x16(in0, in1); + fadst16_sse2(in0, in1); + write_buffer_16x16(output, in0, in1, 16); + break; + default: + assert(tx_type == ADST_ADST); + load_buffer_16x16(input, in0, in1, stride); + fadst16_sse2(in0, in1); + right_shift_16x16(in0, in1); + fadst16_sse2(in0, in1); + write_buffer_16x16(output, in0, in1, 16); + break; + } +} diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_dct_sse2.asm b/media/libvpx/libvpx/vp9/encoder/x86/vp9_dct_sse2.asm new file mode 100644 index 0000000000..8152dce864 --- /dev/null +++ 
b/media/libvpx/libvpx/vp9/encoder/x86/vp9_dct_sse2.asm @@ -0,0 +1,69 @@ +; +; Copyright (c) 2016 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%define private_prefix vp9 + +%include "third_party/x86inc/x86inc.asm" +%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm" + +SECTION .text + +; TRANSFORM_COLS: one 1-D pass over m0..m3 (loaded by the caller as a1, b1, c1, d1), using m4 and m5 as scratch. After the SWAPs the results sit in m0..m3 as a0, c1, d1, b1 (see the per-line annotations). +%macro TRANSFORM_COLS 0 + paddw m0, m1 + movq m4, m0 + psubw m3, m2 + psubw m4, m3 + psraw m4, 1 + movq m5, m4 + psubw m5, m1 ;b1 + psubw m4, m2 ;c1 + psubw m0, m4 + paddw m3, m5 + ; m0 a0 + SWAP 1, 4 ; m1 c1 + SWAP 2, 3 ; m2 d1 + SWAP 3, 5 ; m3 b1 +%endmacro + +; TRANSPOSE_4X4: transposes the 4x4 block of words held in the low halves of m0..m3; the result is packed two rows per register in m0 and m1 (layout shown below). +%macro TRANSPOSE_4X4 0 + ; 00 01 02 03 + ; 10 11 12 13 + ; 20 21 22 23 + ; 30 31 32 33 + punpcklwd m0, m1 ; 00 10 01 11 02 12 03 13 + punpcklwd m2, m3 ; 20 30 21 31 22 32 23 33 + mova m1, m0 + punpckldq m0, m2 ; 00 10 20 30 01 11 21 31 + punpckhdq m1, m2 ; 02 12 22 32 03 13 23 33 +%endmacro + +; fwht4x4(input, output, stride): loads four rows of four 16-bit values (row pitch is stride*2 bytes), applies TRANSFORM_COLS + TRANSPOSE_4X4 twice (the psrldq pair unpacks the two-rows-per-register layout between the passes), shifts every coefficient left by 2 and stores m0/m1 with STORE_TRAN_LOW. +INIT_XMM sse2 +cglobal fwht4x4, 3, 4, 8, input, output, stride + lea r3q, [inputq + strideq*4] + movq m0, [inputq] ;a1 + movq m1, [inputq + strideq*2] ;b1 + movq m2, [r3q] ;c1 + movq m3, [r3q + strideq*2] ;d1 + + TRANSFORM_COLS + TRANSPOSE_4X4 + SWAP 1, 2 + psrldq m1, m0, 8 + psrldq m3, m2, 8 + TRANSFORM_COLS + TRANSPOSE_4X4 + + psllw m0, 2 + psllw m1, 2 + + STORE_TRAN_LOW 0, outputq, 0, 2, 3 + STORE_TRAN_LOW 1, outputq, 8, 2, 3 + + RET diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c new file mode 100644 index 0000000000..5930bf491e --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c @@ -0,0 +1,327 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <emmintrin.h> + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" + +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_reconinter.h" +#include "vp9/encoder/vp9_context_tree.h" +#include "vp9/encoder/vp9_denoiser.h" +#include "vpx_mem/vpx_mem.h" + +// Compute the sum of all pixel differences of this MB. +/* acc_diff carries 16 signed 8-bit lanes. They are sign-extended to 16 bits (unpack with itself, arithmetic shift right by 8), pair-summed to 32 bits with _mm_madd_epi16 against 1, folded with two shifted adds, and the low 32-bit lane is returned. */ +static INLINE int sum_diff_16x1(__m128i acc_diff) { + const __m128i k_1 = _mm_set1_epi16(1); + const __m128i acc_diff_lo = + _mm_srai_epi16(_mm_unpacklo_epi8(acc_diff, acc_diff), 8); + const __m128i acc_diff_hi = + _mm_srai_epi16(_mm_unpackhi_epi8(acc_diff, acc_diff), 8); + const __m128i acc_diff_16 = _mm_add_epi16(acc_diff_lo, acc_diff_hi); + const __m128i hg_fe_dc_ba = _mm_madd_epi16(acc_diff_16, k_1); + const __m128i hgfe_dcba = + _mm_add_epi32(hg_fe_dc_ba, _mm_srli_si128(hg_fe_dc_ba, 8)); + const __m128i hgfedcba = + _mm_add_epi32(hgfe_dcba, _mm_srli_si128(hgfe_dcba, 4)); + return _mm_cvtsi128_si32(hgfedcba); +} + +// Denoise a 16x1 vector. +/* Reads 16 bytes each from sig and mc_running_avg_y (unaligned loads), writes the 16 filtered bytes to running_avg_y and returns acc_diff with this row's signed adjustments accumulated (saturating 8-bit adds). k_0/k_4/k_8/k_16 are the comparison thresholds and l3/l32/l21 the level constants, all passed by pointer; l32 and l21 look like the steps between adjacent levels (names suggest l3-l2 and l2-l1) - confirm at the caller. */ +static INLINE __m128i vp9_denoiser_16x1_sse2( + const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y, + const __m128i *k_0, const __m128i *k_4, const __m128i *k_8, + const __m128i *k_16, const __m128i *l3, const __m128i *l32, + const __m128i *l21, __m128i acc_diff) { + // Calculate differences + const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0])); + const __m128i v_mc_running_avg_y = + _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0])); + __m128i v_running_avg_y; + const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig); + const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y); + // Obtain the sign. 
FF if diff is negative. + /* pdiff is a saturating unsigned subtract, so it is 0 whenever mc_running_avg_y <= sig; the mask is therefore also set for a zero difference */ + const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, *k_0); + // Clamp absolute difference to 16 to be used to get mask. Doing this + // allows us to use _mm_cmpgt_epi8, which operates on signed byte. + const __m128i clamped_absdiff = + _mm_min_epu8(_mm_or_si128(pdiff, ndiff), *k_16); + // Get masks for l2 l1 and l0 adjustments. + const __m128i mask2 = _mm_cmpgt_epi8(*k_16, clamped_absdiff); + const __m128i mask1 = _mm_cmpgt_epi8(*k_8, clamped_absdiff); + const __m128i mask0 = _mm_cmpgt_epi8(*k_4, clamped_absdiff); + // Get adjustments for l2, l1, and l0. + __m128i adj2 = _mm_and_si128(mask2, *l32); + const __m128i adj1 = _mm_and_si128(mask1, *l21); + const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff); + __m128i adj, padj, nadj; + + // Combine the adjustments and get absolute adjustments. + /* adj = l3 minus the steps whose masks are set; lanes below the k_4 threshold use the absolute difference itself instead */ + adj2 = _mm_add_epi8(adj2, adj1); + adj = _mm_sub_epi8(*l3, adj2); + adj = _mm_andnot_si128(mask0, adj); + adj = _mm_or_si128(adj, adj0); + + // Restore the sign and get positive and negative adjustments. + padj = _mm_andnot_si128(diff_sign, adj); + nadj = _mm_and_si128(diff_sign, adj); + + // Calculate filtered value. + v_running_avg_y = _mm_adds_epu8(v_sig, padj); + v_running_avg_y = _mm_subs_epu8(v_running_avg_y, nadj); + _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y); + + // Adjustments <=7, and each element in acc_diff can fit in signed + // char. + acc_diff = _mm_adds_epi8(acc_diff, padj); + acc_diff = _mm_subs_epi8(acc_diff, nadj); + return acc_diff; +} + +// Denoise a 16x1 vector with a weaker filter. +/* Unlike vp9_denoiser_16x1_sse2, this starts from the 16 bytes already stored in running_avg_y and applies the adjustment with the opposite sign (subtract padj, add nadj), with the per-lane magnitude capped at k_delta. acc_diff is updated with the same opposite sign and returned. */ +static INLINE __m128i vp9_denoiser_adj_16x1_sse2( + const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y, + const __m128i k_0, const __m128i k_delta, __m128i acc_diff) { + __m128i v_running_avg_y = _mm_loadu_si128((__m128i *)(&running_avg_y[0])); + // Calculate differences. 
+ const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0])); + const __m128i v_mc_running_avg_y = + _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0])); + const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig); + const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y); + // Obtain the sign. FF if diff is negative. + const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0); + // Clamp absolute difference to delta to get the adjustment. + const __m128i adj = _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta); + // Restore the sign and get positive and negative adjustments. + __m128i padj, nadj; + padj = _mm_andnot_si128(diff_sign, adj); + nadj = _mm_and_si128(diff_sign, adj); + // Calculate filtered value. + v_running_avg_y = _mm_subs_epu8(v_running_avg_y, padj); + v_running_avg_y = _mm_adds_epu8(v_running_avg_y, nadj); + _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y); + + // Accumulate the adjustments. + acc_diff = _mm_subs_epi8(acc_diff, padj); + acc_diff = _mm_adds_epi8(acc_diff, nadj); + return acc_diff; +} + +// Denoise 8x8 and 8x16 blocks. +static int vp9_denoiser_NxM_sse2_small(const uint8_t *sig, int sig_stride, + const uint8_t *mc_running_avg_y, + int mc_avg_y_stride, + uint8_t *running_avg_y, int avg_y_stride, + int increase_denoising, BLOCK_SIZE bs, + int motion_magnitude, int width) { + int sum_diff_thresh, r, sum_diff = 0; + const int shift_inc = + (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) + ? 1 + : 0; + uint8_t sig_buffer[8][16], mc_running_buffer[8][16], running_buffer[8][16]; + __m128i acc_diff = _mm_setzero_si128(); + const __m128i k_0 = _mm_setzero_si128(); + const __m128i k_4 = _mm_set1_epi8(4 + shift_inc); + const __m128i k_8 = _mm_set1_epi8(8); + const __m128i k_16 = _mm_set1_epi8(16); + // Modify each level's adjustment according to motion_magnitude. + const __m128i l3 = _mm_set1_epi8( + (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 
7 + shift_inc : 6); + // Difference between level 3 and level 2 is 2. + const __m128i l32 = _mm_set1_epi8(2); + // Difference between level 2 and level 1 is 1. + const __m128i l21 = _mm_set1_epi8(1); + const int b_height = (4 << b_height_log2_lookup[bs]) >> 1; + + for (r = 0; r < b_height; ++r) { + memcpy(sig_buffer[r], sig, width); + memcpy(sig_buffer[r] + width, sig + sig_stride, width); + memcpy(mc_running_buffer[r], mc_running_avg_y, width); + memcpy(mc_running_buffer[r] + width, mc_running_avg_y + mc_avg_y_stride, + width); + memcpy(running_buffer[r], running_avg_y, width); + memcpy(running_buffer[r] + width, running_avg_y + avg_y_stride, width); + acc_diff = vp9_denoiser_16x1_sse2(sig_buffer[r], mc_running_buffer[r], + running_buffer[r], &k_0, &k_4, &k_8, + &k_16, &l3, &l32, &l21, acc_diff); + memcpy(running_avg_y, running_buffer[r], width); + memcpy(running_avg_y + avg_y_stride, running_buffer[r] + width, width); + // Update pointers for next iteration. + sig += (sig_stride << 1); + mc_running_avg_y += (mc_avg_y_stride << 1); + running_avg_y += (avg_y_stride << 1); + } + + { + sum_diff = sum_diff_16x1(acc_diff); + sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising); + if (abs(sum_diff) > sum_diff_thresh) { + // Before returning to copy the block (i.e., apply no denoising), + // check if we can still apply some (weaker) temporal filtering to + // this block, that would otherwise not be denoised at all. Simplest + // is to apply an additional adjustment to running_avg_y to bring it + // closer to sig. The adjustment is capped by a maximum delta, and + // chosen such that in most cases the resulting sum_diff will be + // within the acceptable range given by sum_diff_thresh. + + // The delta is set by the excess of absolute pixel diff over the + // threshold. + const int delta = + ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1; + // Only apply the adjustment for max delta up to 3. 
+ if (delta < 4) { + const __m128i k_delta = _mm_set1_epi8(delta); + running_avg_y -= avg_y_stride * (b_height << 1); + for (r = 0; r < b_height; ++r) { + acc_diff = vp9_denoiser_adj_16x1_sse2( + sig_buffer[r], mc_running_buffer[r], running_buffer[r], k_0, + k_delta, acc_diff); + memcpy(running_avg_y, running_buffer[r], width); + memcpy(running_avg_y + avg_y_stride, running_buffer[r] + width, + width); + // Update pointers for next iteration. + running_avg_y += (avg_y_stride << 1); + } + sum_diff = sum_diff_16x1(acc_diff); + if (abs(sum_diff) > sum_diff_thresh) { + return COPY_BLOCK; + } + } else { + return COPY_BLOCK; + } + } + } + return FILTER_BLOCK; +} + +// Denoise 16x16, 16x32, 32x16, 32x32, 32x64, 64x32 and 64x64 blocks. +static int vp9_denoiser_NxM_sse2_big(const uint8_t *sig, int sig_stride, + const uint8_t *mc_running_avg_y, + int mc_avg_y_stride, + uint8_t *running_avg_y, int avg_y_stride, + int increase_denoising, BLOCK_SIZE bs, + int motion_magnitude) { + int sum_diff_thresh, r, c, sum_diff = 0; + const int shift_inc = + (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) + ? 1 + : 0; + __m128i acc_diff[4][4]; + const __m128i k_0 = _mm_setzero_si128(); + const __m128i k_4 = _mm_set1_epi8(4 + shift_inc); + const __m128i k_8 = _mm_set1_epi8(8); + const __m128i k_16 = _mm_set1_epi8(16); + // Modify each level's adjustment according to motion_magnitude. + const __m128i l3 = _mm_set1_epi8( + (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 + shift_inc : 6); + // Difference between level 3 and level 2 is 2. + const __m128i l32 = _mm_set1_epi8(2); + // Difference between level 2 and level 1 is 1. 
+ const __m128i l21 = _mm_set1_epi8(1); + const int b_width = (4 << b_width_log2_lookup[bs]); + const int b_height = (4 << b_height_log2_lookup[bs]); + const int b_width_shift4 = b_width >> 4; + + for (r = 0; r < 4; ++r) { + for (c = 0; c < b_width_shift4; ++c) { + acc_diff[c][r] = _mm_setzero_si128(); + } + } + + for (r = 0; r < b_height; ++r) { + for (c = 0; c < b_width_shift4; ++c) { + acc_diff[c][r >> 4] = vp9_denoiser_16x1_sse2( + sig, mc_running_avg_y, running_avg_y, &k_0, &k_4, &k_8, &k_16, &l3, + &l32, &l21, acc_diff[c][r >> 4]); + // Update pointers for next iteration. + sig += 16; + mc_running_avg_y += 16; + running_avg_y += 16; + } + + if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) { + for (c = 0; c < b_width_shift4; ++c) { + sum_diff += sum_diff_16x1(acc_diff[c][r >> 4]); + } + } + + // Update pointers for next iteration. + sig = sig - b_width + sig_stride; + mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride; + running_avg_y = running_avg_y - b_width + avg_y_stride; + } + + { + sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising); + if (abs(sum_diff) > sum_diff_thresh) { + const int delta = + ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1; + + // Only apply the adjustment for max delta up to 3. + if (delta < 4) { + const __m128i k_delta = _mm_set1_epi8(delta); + sig -= sig_stride * b_height; + mc_running_avg_y -= mc_avg_y_stride * b_height; + running_avg_y -= avg_y_stride * b_height; + sum_diff = 0; + for (r = 0; r < b_height; ++r) { + for (c = 0; c < b_width_shift4; ++c) { + acc_diff[c][r >> 4] = + vp9_denoiser_adj_16x1_sse2(sig, mc_running_avg_y, running_avg_y, + k_0, k_delta, acc_diff[c][r >> 4]); + // Update pointers for next iteration. 
+ sig += 16; + mc_running_avg_y += 16; + running_avg_y += 16; + } + + if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) { + for (c = 0; c < b_width_shift4; ++c) { + sum_diff += sum_diff_16x1(acc_diff[c][r >> 4]); + } + } + sig = sig - b_width + sig_stride; + mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride; + running_avg_y = running_avg_y - b_width + avg_y_stride; + } + if (abs(sum_diff) > sum_diff_thresh) { + return COPY_BLOCK; + } + } else { + return COPY_BLOCK; + } + } + } + return FILTER_BLOCK; +} + +int vp9_denoiser_filter_sse2(const uint8_t *sig, int sig_stride, + const uint8_t *mc_avg, int mc_avg_stride, + uint8_t *avg, int avg_stride, + int increase_denoising, BLOCK_SIZE bs, + int motion_magnitude) { + // Rank by frequency of the block type to have an early termination. + if (bs == BLOCK_16X16 || bs == BLOCK_32X32 || bs == BLOCK_64X64 || + bs == BLOCK_16X32 || bs == BLOCK_16X8 || bs == BLOCK_32X16 || + bs == BLOCK_32X64 || bs == BLOCK_64X32) { + return vp9_denoiser_NxM_sse2_big(sig, sig_stride, mc_avg, mc_avg_stride, + avg, avg_stride, increase_denoising, bs, + motion_magnitude); + } else if (bs == BLOCK_8X8 || bs == BLOCK_8X16) { + return vp9_denoiser_NxM_sse2_small(sig, sig_stride, mc_avg, mc_avg_stride, + avg, avg_stride, increase_denoising, bs, + motion_magnitude, 8); + } else { + return COPY_BLOCK; + } +} diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_diamond_search_sad_avx.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_diamond_search_sad_avx.c new file mode 100644 index 0000000000..80442e3594 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_diamond_search_sad_avx.c @@ -0,0 +1,291 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#if defined(_MSC_VER) +#include <intrin.h> +#endif +#include <emmintrin.h> +#include <smmintrin.h> + +#include "vpx_dsp/vpx_dsp_common.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vpx_ports/mem.h" + +#ifdef __GNUC__ +#define LIKELY(v) __builtin_expect(v, 1) +#define UNLIKELY(v) __builtin_expect(v, 0) +#else +#define LIKELY(v) (v) +#define UNLIKELY(v) (v) +#endif + +static INLINE int_mv pack_int_mv(int16_t row, int16_t col) { + int_mv result; + result.as_mv.row = row; + result.as_mv.col = col; + return result; +} +/***************************************************************************** + * This function utilizes 3 properties of the cost function lookup tables, * + * constructed in using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in * + * vp9_encoder.c. * + * For the joint cost: * + * - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3] * + * For the component costs: * + * - For all i: mvsadcost[0][i] == mvsadcost[1][i] * + * (Equal costs for both components) * + * - For all i: mvsadcost[0][i] == mvsadcost[0][-i] * + * (Cost function is even) * + * If these do not hold, then this function cannot be used without * + * modification, in which case you can revert to using the C implementation, * + * which does not rely on these properties. 
* + *****************************************************************************/ +int vp9_diamond_search_sad_avx(const MACROBLOCK *x, + const search_site_config *cfg, MV *ref_mv, + uint32_t start_mv_sad, MV *best_mv, + int search_param, int sad_per_bit, int *num00, + const vp9_sad_fn_ptr_t *sad_fn_ptr, + const MV *center_mv) { + const int_mv maxmv = pack_int_mv(x->mv_limits.row_max, x->mv_limits.col_max); + const __m128i v_max_mv_w = _mm_set1_epi32((int)maxmv.as_int); + const int_mv minmv = pack_int_mv(x->mv_limits.row_min, x->mv_limits.col_min); + const __m128i v_min_mv_w = _mm_set1_epi32((int)minmv.as_int); + + const __m128i v_spb_d = _mm_set1_epi32(sad_per_bit); + + const __m128i v_joint_cost_0_d = _mm_set1_epi32(x->nmvjointsadcost[0]); + const __m128i v_joint_cost_1_d = _mm_set1_epi32(x->nmvjointsadcost[1]); + + // search_param determines the length of the initial step and hence the number + // of iterations. + // 0 = initial step (MAX_FIRST_STEP) pel + // 1 = (MAX_FIRST_STEP/2) pel, + // 2 = (MAX_FIRST_STEP/4) pel... 
+ const MV *ss_mv = &cfg->ss_mv[cfg->searches_per_step * search_param]; + const intptr_t *ss_os = &cfg->ss_os[cfg->searches_per_step * search_param]; + const int tot_steps = cfg->total_steps - search_param; + + const int_mv fcenter_mv = + pack_int_mv(center_mv->row >> 3, center_mv->col >> 3); + const __m128i vfcmv = _mm_set1_epi32((int)fcenter_mv.as_int); + + const int ref_row = ref_mv->row; + const int ref_col = ref_mv->col; + + int_mv bmv = pack_int_mv(ref_row, ref_col); + int_mv new_bmv = bmv; + __m128i v_bmv_w = _mm_set1_epi32((int)bmv.as_int); + + const int what_stride = x->plane[0].src.stride; + const int in_what_stride = x->e_mbd.plane[0].pre[0].stride; + const uint8_t *const what = x->plane[0].src.buf; + const uint8_t *const in_what = + x->e_mbd.plane[0].pre[0].buf + ref_row * in_what_stride + ref_col; + + // Work out the start point for the search + const uint8_t *best_address = in_what; + const uint8_t *new_best_address = best_address; +#if VPX_ARCH_X86_64 + __m128i v_ba_q = _mm_set1_epi64x((intptr_t)best_address); +#else + __m128i v_ba_d = _mm_set1_epi32((intptr_t)best_address); +#endif + // Starting position + unsigned int best_sad = start_mv_sad; + int i, j, step; + + // Check the prerequisite cost function properties that are easy to check + // in an assert. See the function-level documentation for details on all + // prerequisites. 
+ assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]); + assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]); + + *num00 = 0; + + for (i = 0, step = 0; step < tot_steps; step++) { + for (j = 0; j < cfg->searches_per_step; j += 4, i += 4) { + __m128i v_sad_d, v_cost_d, v_outside_d, v_inside_d, v_diff_mv_w; +#if VPX_ARCH_X86_64 + __m128i v_blocka[2]; +#else + __m128i v_blocka[1]; +#endif + + // Compute the candidate motion vectors + const __m128i v_ss_mv_w = _mm_loadu_si128((const __m128i *)&ss_mv[i]); + const __m128i v_these_mv_w = _mm_add_epi16(v_bmv_w, v_ss_mv_w); + // Clamp them to the search bounds + __m128i v_these_mv_clamp_w = v_these_mv_w; + v_these_mv_clamp_w = _mm_min_epi16(v_these_mv_clamp_w, v_max_mv_w); + v_these_mv_clamp_w = _mm_max_epi16(v_these_mv_clamp_w, v_min_mv_w); + // The ones that did not change are inside the search area + v_inside_d = _mm_cmpeq_epi32(v_these_mv_clamp_w, v_these_mv_w); + + // If none of them are inside, then move on + if (LIKELY(_mm_test_all_zeros(v_inside_d, v_inside_d))) { + continue; + } + + // The inverse mask indicates which of the MVs are outside + v_outside_d = _mm_xor_si128(v_inside_d, _mm_set1_epi8((int8_t)0xff)); + // Shift right to keep the sign bit clear, we will use this later + // to set the cost to the maximum value. + v_outside_d = _mm_srli_epi32(v_outside_d, 1); + + // Compute the difference MV + v_diff_mv_w = _mm_sub_epi16(v_these_mv_clamp_w, vfcmv); + // We utilise the fact that the cost function is even, and use the + // absolute difference. This allows us to use unsigned indexes later + // and reduces cache pressure somewhat as only a half of the table + // is ever referenced. + v_diff_mv_w = _mm_abs_epi16(v_diff_mv_w); + + // Compute the SIMD pointer offsets. 
+ { +#if VPX_ARCH_X86_64 // sizeof(intptr_t) == 8 + // Load the offsets + __m128i v_bo10_q = _mm_loadu_si128((const __m128i *)&ss_os[i + 0]); + __m128i v_bo32_q = _mm_loadu_si128((const __m128i *)&ss_os[i + 2]); + // Set the ones falling outside to zero + v_bo10_q = _mm_and_si128(v_bo10_q, _mm_cvtepi32_epi64(v_inside_d)); + v_bo32_q = + _mm_and_si128(v_bo32_q, _mm_unpackhi_epi32(v_inside_d, v_inside_d)); + // Compute the candidate addresses + v_blocka[0] = _mm_add_epi64(v_ba_q, v_bo10_q); + v_blocka[1] = _mm_add_epi64(v_ba_q, v_bo32_q); +#else // VPX_ARCH_X86 // sizeof(intptr_t) == 4 + __m128i v_bo_d = _mm_loadu_si128((const __m128i *)&ss_os[i]); + v_bo_d = _mm_and_si128(v_bo_d, v_inside_d); + v_blocka[0] = _mm_add_epi32(v_ba_d, v_bo_d); +#endif + } + + sad_fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0], + in_what_stride, (uint32_t *)&v_sad_d); + + // Look up the component cost of the residual motion vector + { + const int32_t row0 = _mm_extract_epi16(v_diff_mv_w, 0); + const int32_t col0 = _mm_extract_epi16(v_diff_mv_w, 1); + const int32_t row1 = _mm_extract_epi16(v_diff_mv_w, 2); + const int32_t col1 = _mm_extract_epi16(v_diff_mv_w, 3); + const int32_t row2 = _mm_extract_epi16(v_diff_mv_w, 4); + const int32_t col2 = _mm_extract_epi16(v_diff_mv_w, 5); + const int32_t row3 = _mm_extract_epi16(v_diff_mv_w, 6); + const int32_t col3 = _mm_extract_epi16(v_diff_mv_w, 7); + + // Note: This is a use case for vpgather in AVX2 + const uint32_t cost0 = x->nmvsadcost[0][row0] + x->nmvsadcost[0][col0]; + const uint32_t cost1 = x->nmvsadcost[0][row1] + x->nmvsadcost[0][col1]; + const uint32_t cost2 = x->nmvsadcost[0][row2] + x->nmvsadcost[0][col2]; + const uint32_t cost3 = x->nmvsadcost[0][row3] + x->nmvsadcost[0][col3]; + + __m128i v_cost_10_d, v_cost_32_d; + v_cost_10_d = _mm_cvtsi32_si128(cost0); + v_cost_10_d = _mm_insert_epi32(v_cost_10_d, cost1, 1); + v_cost_32_d = _mm_cvtsi32_si128(cost2); + v_cost_32_d = _mm_insert_epi32(v_cost_32_d, cost3, 1); + 
v_cost_d = _mm_unpacklo_epi64(v_cost_10_d, v_cost_32_d); + } + + // Now add in the joint cost + { + const __m128i v_sel_d = + _mm_cmpeq_epi32(v_diff_mv_w, _mm_setzero_si128()); + const __m128i v_joint_cost_d = + _mm_blendv_epi8(v_joint_cost_1_d, v_joint_cost_0_d, v_sel_d); + v_cost_d = _mm_add_epi32(v_cost_d, v_joint_cost_d); + } + + // Multiply by sad_per_bit + v_cost_d = _mm_mullo_epi32(v_cost_d, v_spb_d); + // ROUND_POWER_OF_TWO(v_cost_d, VP9_PROB_COST_SHIFT) + v_cost_d = _mm_add_epi32(v_cost_d, + _mm_set1_epi32(1 << (VP9_PROB_COST_SHIFT - 1))); + v_cost_d = _mm_srai_epi32(v_cost_d, VP9_PROB_COST_SHIFT); + // Add the cost to the sad + v_sad_d = _mm_add_epi32(v_sad_d, v_cost_d); + + // Make the motion vectors outside the search area have max cost + // by or'ing in the comparison mask, this way the minimum search won't + // pick them. + v_sad_d = _mm_or_si128(v_sad_d, v_outside_d); + + // Find the minimum value and index horizontally in v_sad_d + { + // Try speculatively on 16 bits, so we can use the minpos intrinsic + const __m128i v_sad_w = _mm_packus_epi32(v_sad_d, v_sad_d); + const __m128i v_minp_w = _mm_minpos_epu16(v_sad_w); + + uint32_t local_best_sad = _mm_extract_epi16(v_minp_w, 0); + uint32_t local_best_idx = _mm_extract_epi16(v_minp_w, 1); + + // If the local best value is not saturated, just use it, otherwise + // find the horizontal minimum again the hard way on 32 bits. + // This is executed rarely. 
+ if (UNLIKELY(local_best_sad == 0xffff)) { + __m128i v_loval_d, v_hival_d, v_loidx_d, v_hiidx_d, v_sel_d; + + v_loval_d = v_sad_d; + v_loidx_d = _mm_set_epi32(3, 2, 1, 0); + v_hival_d = _mm_srli_si128(v_loval_d, 8); + v_hiidx_d = _mm_srli_si128(v_loidx_d, 8); + + v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d); + + v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d); + v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d); + v_hival_d = _mm_srli_si128(v_loval_d, 4); + v_hiidx_d = _mm_srli_si128(v_loidx_d, 4); + + v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d); + + v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d); + v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d); + + local_best_sad = _mm_extract_epi32(v_loval_d, 0); + local_best_idx = _mm_extract_epi32(v_loidx_d, 0); + } + + // Update the global minimum if the local minimum is smaller + if (LIKELY(local_best_sad < best_sad)) { +#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#endif + new_bmv = ((const int_mv *)&v_these_mv_w)[local_best_idx]; +#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__) +#pragma GCC diagnostic pop +#endif + new_best_address = ((const uint8_t **)v_blocka)[local_best_idx]; + + best_sad = local_best_sad; + } + } + } + + bmv = new_bmv; + best_address = new_best_address; + + v_bmv_w = _mm_set1_epi32((int)bmv.as_int); +#if VPX_ARCH_X86_64 + v_ba_q = _mm_set1_epi64x((intptr_t)best_address); +#else + v_ba_d = _mm_set1_epi32((intptr_t)best_address); +#endif + + if (UNLIKELY(best_address == in_what)) { + (*num00)++; + } + } + + *best_mv = bmv.as_mv; + return best_sad; +} diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_error_avx2.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_error_avx2.c new file mode 100644 index 0000000000..99fef31d16 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_error_avx2.c @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2014 The 
WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <immintrin.h> + +#include "./vp9_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/x86/bitdepth_conversion_avx2.h" + +int64_t vp9_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, + intptr_t block_size, int64_t *ssz) { + __m256i sse_256, ssz_256; + __m256i exp_dqcoeff_lo, exp_dqcoeff_hi, exp_coeff_lo, exp_coeff_hi; + __m256i sse_hi, ssz_hi; + __m128i sse_128, ssz_128; + int64_t sse; + const __m256i zero = _mm256_setzero_si256(); + + // If the block size is 16 then the results will fit in 32 bits. + if (block_size == 16) { + __m256i coeff_256, dqcoeff_256, coeff_hi, dqcoeff_hi; + // Load 16 elements for coeff and dqcoeff. + coeff_256 = load_tran_low(coeff); + dqcoeff_256 = load_tran_low(dqcoeff); + // dqcoeff - coeff + dqcoeff_256 = _mm256_sub_epi16(dqcoeff_256, coeff_256); + // madd (dqcoeff - coeff) + dqcoeff_256 = _mm256_madd_epi16(dqcoeff_256, dqcoeff_256); + // madd coeff + coeff_256 = _mm256_madd_epi16(coeff_256, coeff_256); + // Save the higher 64 bit of each 128 bit lane. + dqcoeff_hi = _mm256_srli_si256(dqcoeff_256, 8); + coeff_hi = _mm256_srli_si256(coeff_256, 8); + // Add the higher 64 bit to the low 64 bit. + dqcoeff_256 = _mm256_add_epi32(dqcoeff_256, dqcoeff_hi); + coeff_256 = _mm256_add_epi32(coeff_256, coeff_hi); + // Expand each double word in the lower 64 bits to quad word. 
+ sse_256 = _mm256_unpacklo_epi32(dqcoeff_256, zero); + ssz_256 = _mm256_unpacklo_epi32(coeff_256, zero); + } else { + int i; + assert(block_size % 32 == 0); + sse_256 = zero; + ssz_256 = zero; + + for (i = 0; i < block_size; i += 32) { + __m256i coeff_0, coeff_1, dqcoeff_0, dqcoeff_1; + // Load 32 elements for coeff and dqcoeff. + coeff_0 = load_tran_low(coeff + i); + dqcoeff_0 = load_tran_low(dqcoeff + i); + coeff_1 = load_tran_low(coeff + i + 16); + dqcoeff_1 = load_tran_low(dqcoeff + i + 16); + // dqcoeff - coeff + dqcoeff_0 = _mm256_sub_epi16(dqcoeff_0, coeff_0); + dqcoeff_1 = _mm256_sub_epi16(dqcoeff_1, coeff_1); + // madd (dqcoeff - coeff) + dqcoeff_0 = _mm256_madd_epi16(dqcoeff_0, dqcoeff_0); + dqcoeff_1 = _mm256_madd_epi16(dqcoeff_1, dqcoeff_1); + // madd coeff + coeff_0 = _mm256_madd_epi16(coeff_0, coeff_0); + coeff_1 = _mm256_madd_epi16(coeff_1, coeff_1); + // Add the first madd (dqcoeff - coeff) with the second. + dqcoeff_0 = _mm256_add_epi32(dqcoeff_0, dqcoeff_1); + // Add the first madd (coeff) with the second. + coeff_0 = _mm256_add_epi32(coeff_0, coeff_1); + // Expand each double word of madd (dqcoeff - coeff) to quad word. + exp_dqcoeff_lo = _mm256_unpacklo_epi32(dqcoeff_0, zero); + exp_dqcoeff_hi = _mm256_unpackhi_epi32(dqcoeff_0, zero); + // expand each double word of madd (coeff) to quad word + exp_coeff_lo = _mm256_unpacklo_epi32(coeff_0, zero); + exp_coeff_hi = _mm256_unpackhi_epi32(coeff_0, zero); + // Add each quad word of madd (dqcoeff - coeff) and madd (coeff). + sse_256 = _mm256_add_epi64(sse_256, exp_dqcoeff_lo); + ssz_256 = _mm256_add_epi64(ssz_256, exp_coeff_lo); + sse_256 = _mm256_add_epi64(sse_256, exp_dqcoeff_hi); + ssz_256 = _mm256_add_epi64(ssz_256, exp_coeff_hi); + } + } + // Save the higher 64 bit of each 128 bit lane. + sse_hi = _mm256_srli_si256(sse_256, 8); + ssz_hi = _mm256_srli_si256(ssz_256, 8); + // Add the higher 64 bit to the low 64 bit. 
+ sse_256 = _mm256_add_epi64(sse_256, sse_hi); + ssz_256 = _mm256_add_epi64(ssz_256, ssz_hi); + + // Add each 64 bit from each of the 128 bit lane of the 256 bit. + sse_128 = _mm_add_epi64(_mm256_castsi256_si128(sse_256), + _mm256_extractf128_si256(sse_256, 1)); + + ssz_128 = _mm_add_epi64(_mm256_castsi256_si128(ssz_256), + _mm256_extractf128_si256(ssz_256, 1)); + + // Store the results. + _mm_storel_epi64((__m128i *)(&sse), sse_128); + + _mm_storel_epi64((__m128i *)(ssz), ssz_128); + return sse; +} + +int64_t vp9_block_error_fp_avx2(const tran_low_t *coeff, + const tran_low_t *dqcoeff, int block_size) { + int i; + const __m256i zero = _mm256_setzero_si256(); + __m256i sse_256 = zero; + __m256i sse_hi; + __m128i sse_128; + int64_t sse; + + if (block_size == 16) { + // Load 16 elements for coeff and dqcoeff. + const __m256i _coeff = load_tran_low(coeff); + const __m256i _dqcoeff = load_tran_low(dqcoeff); + // dqcoeff - coeff + const __m256i diff = _mm256_sub_epi16(_dqcoeff, _coeff); + // madd (dqcoeff - coeff) + const __m256i error_lo = _mm256_madd_epi16(diff, diff); + // Save the higher 64 bit of each 128 bit lane. + const __m256i error_hi = _mm256_srli_si256(error_lo, 8); + // Add the higher 64 bit to the low 64 bit. + const __m256i error = _mm256_add_epi32(error_lo, error_hi); + // Expand each double word in the lower 64 bits to quad word. + sse_256 = _mm256_unpacklo_epi32(error, zero); + } else { + for (i = 0; i < block_size; i += 16) { + // Load 16 elements for coeff and dqcoeff. + const __m256i _coeff = load_tran_low(coeff); + const __m256i _dqcoeff = load_tran_low(dqcoeff); + const __m256i diff = _mm256_sub_epi16(_dqcoeff, _coeff); + const __m256i error = _mm256_madd_epi16(diff, diff); + // Expand each double word of madd (dqcoeff - coeff) to quad word. + const __m256i exp_error_lo = _mm256_unpacklo_epi32(error, zero); + const __m256i exp_error_hi = _mm256_unpackhi_epi32(error, zero); + // Add each quad word of madd (dqcoeff - coeff). 
+ sse_256 = _mm256_add_epi64(sse_256, exp_error_lo); + sse_256 = _mm256_add_epi64(sse_256, exp_error_hi); + coeff += 16; + dqcoeff += 16; + } + } + // Save the higher 64 bit of each 128 bit lane. + sse_hi = _mm256_srli_si256(sse_256, 8); + // Add the higher 64 bit to the low 64 bit. + sse_256 = _mm256_add_epi64(sse_256, sse_hi); + + // Add each 64 bit from each of the 128 bit lane of the 256 bit. + sse_128 = _mm_add_epi64(_mm256_castsi256_si128(sse_256), + _mm256_extractf128_si256(sse_256, 1)); + + // Store the results. + _mm_storel_epi64((__m128i *)&sse, sse_128); + return sse; +} diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_error_sse2.asm b/media/libvpx/libvpx/vp9/encoder/x86/vp9_error_sse2.asm new file mode 100644 index 0000000000..7beec130ab --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_error_sse2.asm @@ -0,0 +1,115 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%define private_prefix vp9 + +%include "third_party/x86inc/x86inc.asm" +%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm" + +SECTION .text + +; int64_t vp9_block_error(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size, +; int64_t *ssz) + +INIT_XMM sse2 +cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz + pxor m4, m4 ; sse accumulator + pxor m6, m6 ; ssz accumulator + pxor m5, m5 ; dedicated zero register +.loop: + LOAD_TRAN_LOW 2, uqcq, 0 + LOAD_TRAN_LOW 0, dqcq, 0 + LOAD_TRAN_LOW 3, uqcq, 8 + LOAD_TRAN_LOW 1, dqcq, 8 + INCREMENT_ELEMENTS_TRAN_LOW uqcq, 16 + INCREMENT_ELEMENTS_TRAN_LOW dqcq, 16 + sub sizeq, 16 + psubw m0, m2 + psubw m1, m3 + ; individual errors are max. 
15bit+sign, so squares are 30bit, and + ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit) + pmaddwd m0, m0 + pmaddwd m1, m1 + pmaddwd m2, m2 + pmaddwd m3, m3 + ; the sum of 2 31bit integers will fit in a 32bit unsigned integer + paddd m0, m1 + paddd m2, m3 + ; accumulate in 64bit + punpckldq m7, m0, m5 + punpckhdq m0, m5 + paddq m4, m7 + punpckldq m7, m2, m5 + paddq m4, m0 + punpckhdq m2, m5 + paddq m6, m7 + paddq m6, m2 + jg .loop + + ; accumulate horizontally and store in return value + movhlps m5, m4 + movhlps m7, m6 + paddq m4, m5 + paddq m6, m7 +%if VPX_ARCH_X86_64 + movq rax, m4 + movq [sszq], m6 +%else + mov eax, sszm + pshufd m5, m4, 0x1 + movq [eax], m6 + movd eax, m4 + movd edx, m5 +%endif + RET + +; Compute the sum of squared difference between two tran_low_t vectors. +; Vectors are converted (if necessary) to int16_t for calculations. +; int64_t vp9_block_error_fp(tran_low_t *coeff, tran_low_t *dqcoeff, +; intptr_t block_size) + +INIT_XMM sse2 +cglobal block_error_fp, 3, 3, 6, uqc, dqc, size + pxor m4, m4 ; sse accumulator + pxor m5, m5 ; dedicated zero register +.loop: + LOAD_TRAN_LOW 2, uqcq, 0 + LOAD_TRAN_LOW 0, dqcq, 0 + LOAD_TRAN_LOW 3, uqcq, 8 + LOAD_TRAN_LOW 1, dqcq, 8 + INCREMENT_ELEMENTS_TRAN_LOW uqcq, 16 + INCREMENT_ELEMENTS_TRAN_LOW dqcq, 16 + sub sizeq, 16 + psubw m0, m2 + psubw m1, m3 + ; individual errors are max. 
15bit+sign, so squares are 30bit, and + ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit) + pmaddwd m0, m0 + pmaddwd m1, m1 + ; the sum of 2 31bit integers will fit in a 32bit unsigned integer + paddd m0, m1 + ; accumulate in 64bit + punpckldq m3, m0, m5 + punpckhdq m0, m5 + paddq m4, m3 + paddq m4, m0 + jnz .loop + + ; accumulate horizontally and store in return value + movhlps m5, m4 + paddq m4, m5 +%if VPX_ARCH_X86_64 + movq rax, m4 +%else + pshufd m5, m4, 0x1 + movd eax, m4 + movd edx, m5 +%endif + RET diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c new file mode 100644 index 0000000000..94506aad0f --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c @@ -0,0 +1,907 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
// Loads 32 source bytes, keeps only the bytes selected by *mask inside each
// 16-bit lane and packs the surviving 16-bit lanes into 16 output bytes.
static INLINE __m128i scale_plane_2_to_1_phase_0_kernel(
    const uint8_t *const src, const __m128i *const mask) {
  const __m128i keep = *mask;
  const __m128i lo =
      _mm_and_si128(_mm_loadu_si128((const __m128i *)src), keep);
  const __m128i hi =
      _mm_and_si128(_mm_loadu_si128((const __m128i *)(src + 16)), keep);
  return _mm_packus_epi16(lo, hi);
}

// 2:1 decimation at phase 0: every output pixel is a copy of the source pixel
// at twice its row and column (no filtering).
// The row width is rounded up to a multiple of 16, so up to 15 extra output
// bytes (and 30 extra input bytes) per row are touched.
static void scale_plane_2_to_1_phase_0(const uint8_t *src,
                                       const ptrdiff_t src_stride, uint8_t *dst,
                                       const ptrdiff_t dst_stride,
                                       const int dst_w, const int dst_h) {
  const int padded_w = (dst_w + 15) & ~15;
  // Low byte of every 16-bit lane, i.e. the even-indexed source bytes.
  const __m128i even_bytes = _mm_set1_epi16(0x00FF);
  int rows_left = dst_h;

  do {
    const uint8_t *in = src;
    uint8_t *out = dst;
    int cols_left = padded_w;

    do {
      _mm_storeu_si128((__m128i *)out,
                       scale_plane_2_to_1_phase_0_kernel(in, &even_bytes));
      in += 32;
      out += 16;
      cols_left -= 16;
    } while (cols_left);

    src += 2 * src_stride;  // skip the odd source row
    dst += dst_stride;
  } while (--rows_left);
}

// 4:1 decimation at phase 0: every output pixel is a copy of the source pixel
// at four times its row and column (no filtering).
// Same width rounding as the 2:1 variant; 64 source bytes are read for each
// group of 16 output bytes.
static void scale_plane_4_to_1_phase_0(const uint8_t *src,
                                       const ptrdiff_t src_stride, uint8_t *dst,
                                       const ptrdiff_t dst_stride,
                                       const int dst_w, const int dst_h) {
  const int padded_w = (dst_w + 15) & ~15;
  // Low byte of every 32-bit lane, i.e. every fourth source byte.
  const __m128i every_fourth = _mm_set1_epi32(0x000000FF);
  int rows_left = dst_h;

  do {
    const uint8_t *in = src;
    uint8_t *out = dst;
    int cols_left = padded_w;

    do {
      // Two kernel calls leave one pixel per 16-bit lane; a final pack
      // squeezes both halves into 16 contiguous bytes.
      const __m128i first =
          scale_plane_2_to_1_phase_0_kernel(in, &every_fourth);
      const __m128i second =
          scale_plane_2_to_1_phase_0_kernel(in + 32, &every_fourth);
      _mm_storeu_si128((__m128i *)out, _mm_packus_epi16(first, second));
      in += 64;
      out += 16;
      cols_left -= 16;
    } while (cols_left);

    src += 4 * src_stride;  // skip the three intermediate source rows
    dst += dst_stride;
  } while (--rows_left);
}
__m128i *const s, + const __m128i c0c1) { + const __m128i k_64 = _mm_set1_epi16(1 << 6); + const __m128i t0 = _mm_maddubs_epi16(s[0], c0c1); + const __m128i t1 = _mm_maddubs_epi16(s[1], c0c1); + // round and shift by 7 bit each 16 bit + const __m128i t2 = _mm_adds_epi16(t0, k_64); + const __m128i t3 = _mm_adds_epi16(t1, k_64); + const __m128i t4 = _mm_srai_epi16(t2, 7); + const __m128i t5 = _mm_srai_epi16(t3, 7); + return _mm_packus_epi16(t4, t5); +} + +static void scale_plane_2_to_1_bilinear(const uint8_t *src, + const ptrdiff_t src_stride, + uint8_t *dst, + const ptrdiff_t dst_stride, + const int dst_w, const int dst_h, + const __m128i c0c1) { + const int max_width = (dst_w + 15) & ~15; + int y = dst_h; + + do { + int x = max_width; + do { + __m128i s[2], d[2]; + + // Horizontal + // Even rows + s[0] = _mm_loadu_si128((const __m128i *)(src + 0)); + s[1] = _mm_loadu_si128((const __m128i *)(src + 16)); + d[0] = scale_plane_bilinear_kernel(s, c0c1); + + // odd rows + s[0] = _mm_loadu_si128((const __m128i *)(src + src_stride + 0)); + s[1] = _mm_loadu_si128((const __m128i *)(src + src_stride + 16)); + d[1] = scale_plane_bilinear_kernel(s, c0c1); + + // Vertical + s[0] = _mm_unpacklo_epi8(d[0], d[1]); + s[1] = _mm_unpackhi_epi8(d[0], d[1]); + d[0] = scale_plane_bilinear_kernel(s, c0c1); + + _mm_storeu_si128((__m128i *)dst, d[0]); + src += 32; + dst += 16; + x -= 16; + } while (x); + src += 2 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static void scale_plane_4_to_1_bilinear(const uint8_t *src, + const ptrdiff_t src_stride, + uint8_t *dst, + const ptrdiff_t dst_stride, + const int dst_w, const int dst_h, + const __m128i c0c1) { + const int max_width = (dst_w + 15) & ~15; + int y = dst_h; + + do { + int x = max_width; + do { + __m128i s[8], d[8]; + + // Note: Using _mm_packus_epi32() in SSE4.1 could be faster. + // Here we tried to not use shuffle instructions which would be slow + // on some x86 CPUs. 
+ + // Horizontal + // 000 001 xx xx 004 005 xx xx 008 009 xx xx 00C 00D xx xx + // 010 011 xx xx 014 015 xx xx 018 019 xx xx 01C 01D xx xx + // 020 021 xx xx 024 025 xx xx 028 029 xx xx 02C 02D xx xx + // 030 031 xx xx 034 035 xx xx 038 039 xx xx 03C 03D xx xx + // 100 101 xx xx 104 105 xx xx 108 109 xx xx 10C 10D xx xx + // 110 111 xx xx 114 115 xx xx 118 119 xx xx 11C 11D xx xx + // 120 121 xx xx 124 125 xx xx 128 129 xx xx 12C 12D xx xx + // 130 131 xx xx 134 135 xx xx 138 139 xx xx 13C 13D xx xx + s[0] = _mm_loadu_si128((const __m128i *)(&src[0])); + s[1] = _mm_loadu_si128((const __m128i *)(&src[16])); + s[2] = _mm_loadu_si128((const __m128i *)(&src[32])); + s[3] = _mm_loadu_si128((const __m128i *)(&src[48])); + s[4] = _mm_loadu_si128((const __m128i *)(src + src_stride + 0)); + s[5] = _mm_loadu_si128((const __m128i *)(src + src_stride + 16)); + s[6] = _mm_loadu_si128((const __m128i *)(src + src_stride + 32)); + s[7] = _mm_loadu_si128((const __m128i *)(src + src_stride + 48)); + + // 000 001 100 101 xx xx xx xx 004 005 104 105 xx xx xx xx + // 008 009 108 109 xx xx xx xx 00C 00D 10C 10D xx xx xx xx + // 010 011 110 111 xx xx xx xx 014 015 114 115 xx xx xx xx + // 018 019 118 119 xx xx xx xx 01C 01D 11C 11D xx xx xx xx + // 020 021 120 121 xx xx xx xx 024 025 124 125 xx xx xx xx + // 028 029 128 129 xx xx xx xx 02C 02D 12C 12D xx xx xx xx + // 030 031 130 131 xx xx xx xx 034 035 134 135 xx xx xx xx + // 038 039 138 139 xx xx xx xx 03C 03D 13C 13D xx xx xx xx + d[0] = _mm_unpacklo_epi16(s[0], s[4]); + d[1] = _mm_unpackhi_epi16(s[0], s[4]); + d[2] = _mm_unpacklo_epi16(s[1], s[5]); + d[3] = _mm_unpackhi_epi16(s[1], s[5]); + d[4] = _mm_unpacklo_epi16(s[2], s[6]); + d[5] = _mm_unpackhi_epi16(s[2], s[6]); + d[6] = _mm_unpacklo_epi16(s[3], s[7]); + d[7] = _mm_unpackhi_epi16(s[3], s[7]); + + // 000 001 100 101 008 009 108 109 xx xx xx xx xx xx xx xx + // 004 005 104 105 00C 00D 10C 10D xx xx xx xx xx xx xx xx + // 010 011 110 111 018 019 118 119 xx xx xx xx xx xx xx xx 
// 2:1 downscale of one plane with the filter taps in coef (shuffled by
// shuffle_filter_ssse3() and applied with convolve8_8_ssse3()), done in two
// passes through temp_buffer:
//   1. horizontal: 8 source rows at a time, 4 output columns per step. The
//      results of two consecutive rows are stored byte-interleaved, so one
//      "row pair" of the temp buffer is 2 * width_hor bytes wide.
//   2. vertical: 8 output columns at a time, 4 output rows per step.
//
// w and h are the output size and must be non-zero (asserted).
//
// Buffer footprint, as implemented below:
//   - the horizontal pass writes height_hor rows of width_hor bytes;
//   - the vertical pass reads 3 + height_ver row pairs, i.e.
//     2 * height_ver + 6 rows, with a 16-byte load for every 8 output columns
//     (width_ver is rounded up to 8 while width_hor is only rounded up to 4).
// NOTE(review): when h % 4 == 1 the vertical pass therefore reads 6 rows more
// than the horizontal pass produced, and when w % 8 is in 1..4 its last
// 16-byte load of each row pair runs 8 bytes past that pair. temp_buffer must
// be large enough for both; the surplus only feeds output rows/columns beyond
// h and w.
static void scale_plane_2_to_1_general(const uint8_t *src, const int src_stride,
                                       uint8_t *dst, const int dst_stride,
                                       const int w, const int h,
                                       const int16_t *const coef,
                                       uint8_t *const temp_buffer) {
  const int width_hor = (w + 3) & ~3;
  const int width_ver = (w + 7) & ~7;
  const int height_hor = (2 * h + SUBPEL_TAPS - 2 + 7) & ~7;
  const int height_ver = (h + 3) & ~3;
  int x, y = height_hor;
  uint8_t *t = temp_buffer;
  __m128i s[11], d[4];
  __m128i f[4];

  assert(w && h);

  shuffle_filter_ssse3(coef, f);
  // Step back to the first row/column covered by the filter window.
  src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 1;

  // horizontal 4x8
  do {
    load_8bit_8x8(src + 2, src_stride, s);
    // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71
    // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73
    // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75
    // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 (overlapped)
    transpose_16bit_4x8(s, s);
    x = width_hor;

    do {
      src += 8;
      // Overwrites s[3] (the overlapped column pair) and fills s[4..6].
      load_8bit_8x8(src, src_stride, &s[3]);
      // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77
      // 08 09 18 19 28 29 38 39 48 49 58 59 68 69 78 79
      // 0A 0B 1A 1B 2A 2B 3A 3B 4A 4B 5A 5B 6A 6B 7A 7B
      // 0C 0D 1C 1D 2C 2D 3C 3D 4C 4D 5C 5D 6C 6D 7C 7D
      transpose_16bit_4x8(&s[3], &s[3]);

      d[0] = convolve8_8_ssse3(&s[0], f);  // 00 10 20 30 40 50 60 70
      d[1] = convolve8_8_ssse3(&s[1], f);  // 01 11 21 31 41 51 61 71
      d[2] = convolve8_8_ssse3(&s[2], f);  // 02 12 22 32 42 52 62 72
      d[3] = convolve8_8_ssse3(&s[3], f);  // 03 13 23 33 43 53 63 73

      // 00 10 20 30 40 50 60 70 02 12 22 32 42 52 62 72
      // 01 11 21 31 41 51 61 71 03 13 23 33 43 53 63 73
      d[0] = _mm_packus_epi16(d[0], d[2]);
      d[1] = _mm_packus_epi16(d[1], d[3]);
      // 00 10 01 11 20 30 21 31 40 50 41 51 60 70 61 71
      // 02 12 03 13 22 32 23 33 42 52 43 53 62 72 63 73
      d[2] = _mm_unpacklo_epi16(d[0], d[1]);
      d[3] = _mm_unpackhi_epi16(d[0], d[1]);
      // 00 10 01 11 02 12 03 13 20 30 21 31 22 32 23 33
      // 40 50 41 51 42 52 43 53 60 70 61 71 62 72 63 73
      d[0] = _mm_unpacklo_epi32(d[2], d[3]);
      d[1] = _mm_unpackhi_epi32(d[2], d[3]);
      store_8bit_8x4_from_16x2(d, t, 2 * width_hor);

      // Slide the window: the last three column pairs become the first three.
      s[0] = s[4];
      s[1] = s[5];
      s[2] = s[6];

      t += 8;
      x -= 4;
    } while (x);
    // The inner loop advanced src by 2 * width_hor and t by 2 * width_hor;
    // move both to the start of the next group of 8 rows (4 row pairs).
    src += 8 * src_stride - 2 * width_hor;
    t += 6 * width_hor;
    y -= 8;
  } while (y);

  // vertical 8x4
  x = width_ver;
  t = temp_buffer;
  do {
    // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
    // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
    // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
    s[0] = _mm_loadu_si128((const __m128i *)(t + 0 * width_hor));
    s[1] = _mm_loadu_si128((const __m128i *)(t + 2 * width_hor));
    s[2] = _mm_loadu_si128((const __m128i *)(t + 4 * width_hor));
    t += 6 * width_hor;
    y = height_ver;

    do {
      // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
      // 80 90 81 91 82 92 83 93 84 94 85 95 86 96 87 97
      // A0 B0 A1 B1 A2 B2 A3 B3 A4 B4 A5 B5 A6 B6 A7 B7
      // C0 D0 C1 D1 C2 D2 C3 D3 C4 D4 C5 D5 C6 D6 C7 D7
      loadu_8bit_16x4(t, 2 * width_hor, &s[3]);
      t += 8 * width_hor;

      d[0] = convolve8_8_ssse3(&s[0], f);  // 00 01 02 03 04 05 06 07
      d[1] = convolve8_8_ssse3(&s[1], f);  // 10 11 12 13 14 15 16 17
      d[2] = convolve8_8_ssse3(&s[2], f);  // 20 21 22 23 24 25 26 27
      d[3] = convolve8_8_ssse3(&s[3], f);  // 30 31 32 33 34 35 36 37

      // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
      // 20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37
      d[0] = _mm_packus_epi16(d[0], d[1]);
      d[1] = _mm_packus_epi16(d[2], d[3]);
      store_8bit_8x4_from_16x2(d, dst, dst_stride);

      // Slide the window down by four row pairs.
      s[0] = s[4];
      s[1] = s[5];
      s[2] = s[6];

      dst += 4 * dst_stride;
      y -= 4;
    } while (y);
    // Rewind to the top of the temp buffer and step to the next 8 columns
    // (16 bytes, because two rows are interleaved).
    t -= width_hor * (2 * height_ver + 6);
    t += 16;
    dst -= height_ver * dst_stride;
    dst += 8;
    x -= 8;
  } while (x);
}
44 45 54 55 64 65 74 75 (overlapped) + // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 (overlapped) + transpose_16bit_4x8(s, s); + x = width_hor; + + do { + src += 8; + load_8bit_8x8(src, src_stride, &s[2]); + // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75 + // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 + // 08 09 18 19 28 29 38 39 48 49 58 59 68 69 78 79 + // 0A 0B 1A 1B 2A 2B 3A 3B 4A 4B 5A 5B 6A 6B 7A 7B + transpose_16bit_4x8(&s[2], &s[2]); + + d[0] = convolve8_8_ssse3(&s[0], f); // 00 10 20 30 40 50 60 70 + d[1] = convolve8_8_ssse3(&s[2], f); // 01 11 21 31 41 51 61 71 + + // 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx xx + // 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx + d[0] = _mm_packus_epi16(d[0], d[0]); + d[1] = _mm_packus_epi16(d[1], d[1]); + // 00 10 01 11 20 30 21 31 40 50 41 51 60 70 61 71 + d[0] = _mm_unpacklo_epi16(d[0], d[1]); + store_8bit_4x4_sse2(d[0], t, 2 * width_hor); + + s[0] = s[4]; + s[1] = s[5]; + + t += 4; + x -= 2; + } while (x); + src += 8 * src_stride - 4 * width_hor; + t += 6 * width_hor; + y -= 8; + } while (y); + + // vertical 8x2 + x = width_ver; + t = temp_buffer; + do { + // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + s[0] = _mm_loadu_si128((const __m128i *)(t + 0 * width_hor)); + s[1] = _mm_loadu_si128((const __m128i *)(t + 2 * width_hor)); + t += 4 * width_hor; + y = height_ver; + + do { + // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + // 80 90 81 91 82 92 83 93 84 94 85 95 86 96 87 77 + // A0 B0 A1 B1 A2 B2 A3 B3 A4 B4 A5 B5 A6 B6 A7 77 + loadu_8bit_16x4(t, 2 * width_hor, &s[2]); + t += 8 * width_hor; + + d[0] = convolve8_8_ssse3(&s[0], f); // 00 01 02 03 04 05 06 07 + d[1] = convolve8_8_ssse3(&s[2], f); // 10 11 12 13 14 15 16 17 + + // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17 + d[0] = _mm_packus_epi16(d[0], d[1]); + _mm_storel_epi64((__m128i *)(dst + 0 * dst_stride), d[0]); + 
// Pointer to a routine that rearranges the 16-bit taps in `kernel` into the
// shuffled __m128i form written to `shuffled`. Used for the
// kShuffleFilterFuncs dispatch table in scale_plane_4_to_3_general().
typedef void (*shuffle_filter_funcs)(const int16_t *kernel, __m128i *shuffled);

// Pointer to a routine that applies the shuffled taps in `taps` to the pixel
// registers starting at `pixels` and returns the result. Used for the
// kConvolve8Funcs dispatch table in scale_plane_4_to_3_general().
typedef __m128i (*convolve8_funcs)(const __m128i *pixels, const __m128i *taps);
+ const int offset_idx1 = (offset1_q4 >> 4) & 1; + const int offset_idx2 = (offset2_q4 >> 4) & 1; + static const shuffle_filter_funcs kShuffleFilterFuncs[2] = { + shuffle_filter_ssse3, shuffle_filter_odd_ssse3 + }; + static const convolve8_funcs kConvolve8Funcs[2] = { + convolve8_8_even_offset_ssse3, convolve8_8_odd_offset_ssse3 + }; + + assert(w && h); + + shuffle_filter_ssse3(coef[(phase_scaler + 0 * step_q4) & SUBPEL_MASK], f0); + kShuffleFilterFuncs[offset_idx1](coef[offset1_q4 & SUBPEL_MASK], f1); + kShuffleFilterFuncs[offset_idx2](coef[offset2_q4 & SUBPEL_MASK], f2); + + // Sub 64 to avoid overflow. + // Coef 128 would be treated as -128 in PMADDUBSW. Sub 64 here. + // Coef 128 is in either fx[1] or fx[2] depending on the phase idx. + // When filter phase idx is 1, the two biggest coefficients are shuffled + // together, and the sum of them are always no less than 128. Sub 64 here. + // After the subtraction, when the sum of all positive coefficients are no + // larger than 128, and the sum of all negative coefficients are no + // less than -128, there will be no overflow in the convolve8 functions. 
+ f0[1] = _mm_sub_epi8(f0[1], _mm_set1_epi8(64)); + f1[1 + offset_idx1] = _mm_sub_epi8(f1[1 + offset_idx1], _mm_set1_epi8(64)); + f2[1 + offset_idx2] = _mm_sub_epi8(f2[1 + offset_idx2], _mm_set1_epi8(64)); + + src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 - 1; + + // horizontal 6x8 + do { + load_8bit_8x8(src, src_stride, s); + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71 + // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73 + // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75 + // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 + transpose_16bit_4x8(s, s); + x = width_hor; + + do { + src += 8; + load_8bit_8x8(src, src_stride, &s[4]); + // 08 09 18 19 28 29 38 39 48 49 58 59 68 69 78 79 + // 0A 0B 1A 1B 2A 2B 3A 3B 4A 4B 5A 5B 6A 6B 7A 7B + // OC 0D 1C 1D 2C 2D 3C 3D 4C 4D 5C 5D 6C 6D 7C 7D + // 0E 0F 1E 1F 2E 2F 3E 3F 4E 4F 5E 5F 6E 6F 7E 7F + transpose_16bit_4x8(&s[4], &s[4]); + + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + d[0] = convolve8_8_even_offset_ssse3(&s[0], f0); + d[1] = kConvolve8Funcs[offset_idx1](&s[offset1_q4 >> 5], f1); + d[2] = kConvolve8Funcs[offset_idx2](&s[offset2_q4 >> 5], f2); + d[3] = convolve8_8_even_offset_ssse3(&s[2], f0); + d[4] = kConvolve8Funcs[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1); + d[5] = kConvolve8Funcs[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2); + + // 00 10 20 30 40 50 60 70 02 12 22 32 42 52 62 72 + // 01 11 21 31 41 51 61 71 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 xx xx xx xx xx xx xx xx + // 05 15 25 35 45 55 65 75 xx xx xx xx xx xx xx xx + dd[0] = _mm_packus_epi16(d[0], d[2]); + dd[1] = _mm_packus_epi16(d[1], d[3]); + dd[2] = _mm_packus_epi16(d[4], d[4]); + dd[3] = _mm_packus_epi16(d[5], d[5]); + + // 00 10 01 11 20 30 21 31 40 50 41 51 60 70 61 71 + // 02 12 03 13 22 32 23 33 42 52 43 53 62 72 63 73 + // 04 14 05 15 24 34 25 35 44 54 45 55 64 74 65 75 + d[0] = 
_mm_unpacklo_epi16(dd[0], dd[1]); + d[1] = _mm_unpackhi_epi16(dd[0], dd[1]); + d[2] = _mm_unpacklo_epi16(dd[2], dd[3]); + + // 00 10 01 11 02 12 03 13 20 30 21 31 22 32 23 33 + // 40 50 41 51 42 52 43 53 60 70 61 71 62 72 63 73 + // 04 14 05 15 xx xx xx xx 24 34 25 35 xx xx xx xx + // 44 54 45 55 xx xx xx xx 64 74 65 75 xx xx xx xx + dd[0] = _mm_unpacklo_epi32(d[0], d[1]); + dd[1] = _mm_unpackhi_epi32(d[0], d[1]); + dd[2] = _mm_unpacklo_epi32(d[2], d[2]); + dd[3] = _mm_unpackhi_epi32(d[2], d[2]); + + // 00 10 01 11 02 12 03 13 04 14 05 15 xx xx xx xx + // 20 30 21 31 22 32 23 33 24 34 25 35 xx xx xx xx + // 40 50 41 51 42 52 43 53 44 54 45 55 xx xx xx xx + // 60 70 61 71 62 72 63 73 64 74 65 75 xx xx xx xx + d[0] = _mm_unpacklo_epi64(dd[0], dd[2]); + d[1] = _mm_unpackhi_epi64(dd[0], dd[2]); + d[2] = _mm_unpacklo_epi64(dd[1], dd[3]); + d[3] = _mm_unpackhi_epi64(dd[1], dd[3]); + + // store 4 extra pixels + storeu_8bit_16x4(d, t, stride_hor); + + s[0] = s[4]; + s[1] = s[5]; + s[2] = s[6]; + s[3] = s[7]; + + t += 12; + x -= 6; + } while (x); + src += 8 * src_stride - 4 * width_hor / 3; + t += 3 * stride_hor + 4; + y -= 8; + } while (y); + + // vertical 8x6 + x = width_ver; + t = temp_buffer; + do { + // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + loadu_8bit_16x4(t, stride_hor, s); + y = height_ver; + + do { + // 80 90 81 91 82 92 83 93 84 94 85 95 86 96 87 97 + // A0 B0 A1 B1 A2 B2 A3 B3 A4 B4 A5 B5 A6 B6 A7 B7 + // C0 D0 C1 D1 C2 D2 C3 D3 C4 D4 C5 D5 C6 D6 C7 D7 + // E0 F0 E1 F1 E2 F2 E3 F3 E4 F4 E5 F5 E6 F6 E7 F7 + t += 4 * stride_hor; + loadu_8bit_16x4(t, stride_hor, &s[4]); + + d[0] = convolve8_8_even_offset_ssse3(&s[0], f0); + d[1] = kConvolve8Funcs[offset_idx1](&s[offset1_q4 >> 5], f1); + d[2] = kConvolve8Funcs[offset_idx2](&s[offset2_q4 >> 5], f2); + d[3] = convolve8_8_even_offset_ssse3(&s[2], f0); + 
d[4] = kConvolve8Funcs[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1); + d[5] = kConvolve8Funcs[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2); + + // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17 + // 20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37 + // 40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57 + d[0] = _mm_packus_epi16(d[0], d[1]); + d[2] = _mm_packus_epi16(d[2], d[3]); + d[4] = _mm_packus_epi16(d[4], d[5]); + + _mm_storel_epi64((__m128i *)(dst + 0 * dst_stride), d[0]); + _mm_storeh_epi64((__m128i *)(dst + 1 * dst_stride), d[0]); + _mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), d[2]); + _mm_storeh_epi64((__m128i *)(dst + 3 * dst_stride), d[2]); + _mm_storel_epi64((__m128i *)(dst + 4 * dst_stride), d[4]); + _mm_storeh_epi64((__m128i *)(dst + 5 * dst_stride), d[4]); + + s[0] = s[4]; + s[1] = s[5]; + s[2] = s[6]; + s[3] = s[7]; + + dst += 6 * dst_stride; + y -= 6; + } while (y); + t -= stride_hor * 2 * height_ver / 3; + t += 16; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + +static INLINE __m128i scale_1_to_2_phase_0_kernel(const __m128i *const s, + const __m128i *const f) { + __m128i ss[4], temp; + + ss[0] = _mm_unpacklo_epi8(s[0], s[1]); + ss[1] = _mm_unpacklo_epi8(s[2], s[3]); + ss[2] = _mm_unpacklo_epi8(s[4], s[5]); + ss[3] = _mm_unpacklo_epi8(s[6], s[7]); + temp = convolve8_8_ssse3(ss, f); + return _mm_packus_epi16(temp, temp); +} + +// Only calculate odd columns since even columns are just src pixels' copies. 
+static void scale_1_to_2_phase_0_row(const uint8_t *src, uint8_t *dst, + const int w, const __m128i *const f) { + int x = w; + + do { + __m128i s[8], temp; + s[0] = _mm_loadl_epi64((const __m128i *)(src + 0)); + s[1] = _mm_loadl_epi64((const __m128i *)(src + 1)); + s[2] = _mm_loadl_epi64((const __m128i *)(src + 2)); + s[3] = _mm_loadl_epi64((const __m128i *)(src + 3)); + s[4] = _mm_loadl_epi64((const __m128i *)(src + 4)); + s[5] = _mm_loadl_epi64((const __m128i *)(src + 5)); + s[6] = _mm_loadl_epi64((const __m128i *)(src + 6)); + s[7] = _mm_loadl_epi64((const __m128i *)(src + 7)); + temp = scale_1_to_2_phase_0_kernel(s, f); + _mm_storel_epi64((__m128i *)dst, temp); + src += 8; + dst += 8; + x -= 8; + } while (x); +} + +static void scale_plane_1_to_2_phase_0(const uint8_t *src, + const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, + const int src_w, const int src_h, + const int16_t *const coef, + uint8_t *const temp_buffer) { + int max_width; + int y; + uint8_t *tmp[9]; + __m128i f[4]; + + max_width = (src_w + 7) & ~7; + tmp[0] = temp_buffer + 0 * max_width; + tmp[1] = temp_buffer + 1 * max_width; + tmp[2] = temp_buffer + 2 * max_width; + tmp[3] = temp_buffer + 3 * max_width; + tmp[4] = temp_buffer + 4 * max_width; + tmp[5] = temp_buffer + 5 * max_width; + tmp[6] = temp_buffer + 6 * max_width; + tmp[7] = temp_buffer + 7 * max_width; + + shuffle_filter_ssse3(coef, f); + + scale_1_to_2_phase_0_row(src - 3 * src_stride - 3, tmp[0], max_width, f); + scale_1_to_2_phase_0_row(src - 2 * src_stride - 3, tmp[1], max_width, f); + scale_1_to_2_phase_0_row(src - 1 * src_stride - 3, tmp[2], max_width, f); + scale_1_to_2_phase_0_row(src + 0 * src_stride - 3, tmp[3], max_width, f); + scale_1_to_2_phase_0_row(src + 1 * src_stride - 3, tmp[4], max_width, f); + scale_1_to_2_phase_0_row(src + 2 * src_stride - 3, tmp[5], max_width, f); + scale_1_to_2_phase_0_row(src + 3 * src_stride - 3, tmp[6], max_width, f); + + y = src_h; + do { + int x; + 
scale_1_to_2_phase_0_row(src + 4 * src_stride - 3, tmp[7], max_width, f); + for (x = 0; x < max_width; x += 8) { + __m128i s[8], C, D, CD; + + // Even rows + const __m128i a = _mm_loadl_epi64((const __m128i *)(src + x)); + const __m128i b = _mm_loadl_epi64((const __m128i *)(tmp[3] + x)); + const __m128i ab = _mm_unpacklo_epi8(a, b); + _mm_storeu_si128((__m128i *)(dst + 2 * x), ab); + + // Odd rows + // Even columns + load_8bit_8x8(src + x - 3 * src_stride, src_stride, s); + C = scale_1_to_2_phase_0_kernel(s, f); + + // Odd columns + s[0] = _mm_loadl_epi64((const __m128i *)(tmp[0] + x)); + s[1] = _mm_loadl_epi64((const __m128i *)(tmp[1] + x)); + s[2] = _mm_loadl_epi64((const __m128i *)(tmp[2] + x)); + s[3] = _mm_loadl_epi64((const __m128i *)(tmp[3] + x)); + s[4] = _mm_loadl_epi64((const __m128i *)(tmp[4] + x)); + s[5] = _mm_loadl_epi64((const __m128i *)(tmp[5] + x)); + s[6] = _mm_loadl_epi64((const __m128i *)(tmp[6] + x)); + s[7] = _mm_loadl_epi64((const __m128i *)(tmp[7] + x)); + D = scale_1_to_2_phase_0_kernel(s, f); + + CD = _mm_unpacklo_epi8(C, D); + _mm_storeu_si128((__m128i *)(dst + dst_stride + 2 * x), CD); + } + + src += src_stride; + dst += 2 * dst_stride; + tmp[8] = tmp[0]; + tmp[0] = tmp[1]; + tmp[1] = tmp[2]; + tmp[2] = tmp[3]; + tmp[3] = tmp[4]; + tmp[4] = tmp[5]; + tmp[5] = tmp[6]; + tmp[6] = tmp[7]; + tmp[7] = tmp[8]; + } while (--y); +} + +void vp9_scale_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, + uint8_t filter_type, int phase_scaler) { + const int src_w = src->y_crop_width; + const int src_h = src->y_crop_height; + const int dst_w = dst->y_crop_width; + const int dst_h = dst->y_crop_height; + const int dst_uv_w = dst->uv_crop_width; + const int dst_uv_h = dst->uv_crop_height; + int scaled = 0; + + // phase_scaler is usually 0 or 8. 
+ assert(phase_scaler >= 0 && phase_scaler < 16); + + if (dst_w * 2 == src_w && dst_h * 2 == src_h) { + // 2 to 1 + scaled = 1; + + if (phase_scaler == 0) { + scale_plane_2_to_1_phase_0(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h); + scale_plane_2_to_1_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h); + scale_plane_2_to_1_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h); + } else if (filter_type == BILINEAR) { + const int16_t c0 = vp9_filter_kernels[BILINEAR][phase_scaler][3]; + const int16_t c1 = vp9_filter_kernels[BILINEAR][phase_scaler][4]; + const __m128i c0c1 = _mm_set1_epi16(c0 | (c1 << 8)); // c0 and c1 >= 0 + scale_plane_2_to_1_bilinear(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h, c0c1); + scale_plane_2_to_1_bilinear(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, c0c1); + scale_plane_2_to_1_bilinear(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, c0c1); + } else { + const int buffer_stride = (dst_w + 3) & ~3; + const int buffer_height = (2 * dst_h + SUBPEL_TAPS - 2 + 7) & ~7; + uint8_t *const temp_buffer = + (uint8_t *)malloc(buffer_stride * buffer_height); + if (temp_buffer) { + scale_plane_2_to_1_general( + src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w, + dst_h, vp9_filter_kernels[filter_type][phase_scaler], temp_buffer); + scale_plane_2_to_1_general( + src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride, + dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler], + temp_buffer); + scale_plane_2_to_1_general( + src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride, + dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler], + temp_buffer); + free(temp_buffer); + } else { + scaled = 0; + } + } + } else if (4 * dst_w == src_w && 4 * dst_h == src_h) { + // 4 to 1 + scaled = 1; + if 
(phase_scaler == 0) { + scale_plane_4_to_1_phase_0(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h); + scale_plane_4_to_1_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h); + scale_plane_4_to_1_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h); + } else if (filter_type == BILINEAR) { + const int16_t c0 = vp9_filter_kernels[BILINEAR][phase_scaler][3]; + const int16_t c1 = vp9_filter_kernels[BILINEAR][phase_scaler][4]; + const __m128i c0c1 = _mm_set1_epi16(c0 | (c1 << 8)); // c0 and c1 >= 0 + scale_plane_4_to_1_bilinear(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h, c0c1); + scale_plane_4_to_1_bilinear(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, c0c1); + scale_plane_4_to_1_bilinear(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, c0c1); + } else { + const int buffer_stride = (dst_w + 1) & ~1; + const int buffer_height = (4 * dst_h + SUBPEL_TAPS - 2 + 7) & ~7; + // When dst_w is 1 or 2, we need extra padding to avoid heap read overflow + const int extra_padding = 16; + uint8_t *const temp_buffer = + (uint8_t *)malloc(buffer_stride * buffer_height + extra_padding); + if (temp_buffer) { + scale_plane_4_to_1_general( + src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w, + dst_h, vp9_filter_kernels[filter_type][phase_scaler], temp_buffer); + scale_plane_4_to_1_general( + src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride, + dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler], + temp_buffer); + scale_plane_4_to_1_general( + src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride, + dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler], + temp_buffer); + free(temp_buffer); + } else { + scaled = 0; + } + } + } else if (4 * dst_w == 3 * src_w && 4 * dst_h == 3 * src_h) { + // 4 to 3 + const int 
buffer_stride_hor = (dst_w + 5) - ((dst_w + 5) % 6) + 2; + const int buffer_stride_ver = (dst_w + 7) & ~7; + const int buffer_height = (4 * dst_h / 3 + SUBPEL_TAPS - 1 + 7) & ~7; + // When the vertical filter reads more pixels than the horizontal filter + // generated in each row, we need extra padding to avoid heap read overflow. + // For example, the horizontal filter generates 18 pixels but the vertical + // filter reads 24 pixels in a row. The difference is multiplied by 2 since + // two rows are interlaced together in the optimization. + const int extra_padding = (buffer_stride_ver > buffer_stride_hor) + ? 2 * (buffer_stride_ver - buffer_stride_hor) + : 0; + const int buffer_size = buffer_stride_hor * buffer_height + extra_padding; + uint8_t *const temp_buffer = (uint8_t *)malloc(buffer_size); + if (temp_buffer) { + scaled = 1; + scale_plane_4_to_3_general( + src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w, + dst_h, vp9_filter_kernels[filter_type], phase_scaler, temp_buffer); + scale_plane_4_to_3_general(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, + vp9_filter_kernels[filter_type], phase_scaler, + temp_buffer); + scale_plane_4_to_3_general(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, + vp9_filter_kernels[filter_type], phase_scaler, + temp_buffer); + free(temp_buffer); + } + } else if (dst_w == src_w * 2 && dst_h == src_h * 2 && phase_scaler == 0) { + // 1 to 2 + uint8_t *const temp_buffer = (uint8_t *)malloc(8 * ((src_w + 7) & ~7)); + if (temp_buffer) { + scaled = 1; + scale_plane_1_to_2_phase_0( + src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, src_w, + src_h, vp9_filter_kernels[filter_type][8], temp_buffer); + scale_plane_1_to_2_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, src_w / 2, src_h / 2, + vp9_filter_kernels[filter_type][8], + temp_buffer); + scale_plane_1_to_2_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer, + 
dst->uv_stride, src_w / 2, src_h / 2, + vp9_filter_kernels[filter_type][8], + temp_buffer); + free(temp_buffer); + } + } + + if (scaled) { + vpx_extend_frame_borders(dst); + } else { + // Call c version for all other scaling ratios. + vp9_scale_and_extend_frame_c(src, dst, filter_type, phase_scaler); + } +} diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c new file mode 100644 index 0000000000..d7aafe7b01 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <emmintrin.h> +#include <stdio.h> + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" + +int64_t vp9_highbd_block_error_sse2(const tran_low_t *coeff, + const tran_low_t *dqcoeff, + intptr_t block_size, int64_t *ssz, int bd) { + int i, j, test; + uint32_t temp[4]; + __m128i max, min, cmp0, cmp1, cmp2, cmp3; + int64_t error = 0, sqcoeff = 0; + const int shift = 2 * (bd - 8); + const int rounding = shift > 0 ? 
1 << (shift - 1) : 0; + + for (i = 0; i < block_size; i += 8) { + // Load the data into xmm registers + __m128i mm_coeff = _mm_load_si128((const __m128i *)(coeff + i)); + __m128i mm_coeff2 = _mm_load_si128((const __m128i *)(coeff + i + 4)); + __m128i mm_dqcoeff = _mm_load_si128((const __m128i *)(dqcoeff + i)); + __m128i mm_dqcoeff2 = _mm_load_si128((const __m128i *)(dqcoeff + i + 4)); + // Check if any values require more than 15 bit + max = _mm_set1_epi32(0x3fff); + min = _mm_set1_epi32((int32_t)0xffffc000); + cmp0 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff, max), + _mm_cmplt_epi32(mm_coeff, min)); + cmp1 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff2, max), + _mm_cmplt_epi32(mm_coeff2, min)); + cmp2 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff, max), + _mm_cmplt_epi32(mm_dqcoeff, min)); + cmp3 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff2, max), + _mm_cmplt_epi32(mm_dqcoeff2, min)); + test = _mm_movemask_epi8( + _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3))); + + if (!test) { + __m128i mm_diff, error_sse2, sqcoeff_sse2; + mm_coeff = _mm_packs_epi32(mm_coeff, mm_coeff2); + mm_dqcoeff = _mm_packs_epi32(mm_dqcoeff, mm_dqcoeff2); + mm_diff = _mm_sub_epi16(mm_coeff, mm_dqcoeff); + error_sse2 = _mm_madd_epi16(mm_diff, mm_diff); + sqcoeff_sse2 = _mm_madd_epi16(mm_coeff, mm_coeff); + _mm_storeu_si128((__m128i *)temp, error_sse2); + error = error + temp[0] + temp[1] + temp[2] + temp[3]; + _mm_storeu_si128((__m128i *)temp, sqcoeff_sse2); + sqcoeff += temp[0] + temp[1] + temp[2] + temp[3]; + } else { + for (j = 0; j < 8; j++) { + const int64_t diff = coeff[i + j] - dqcoeff[i + j]; + error += diff * diff; + sqcoeff += (int64_t)coeff[i + j] * (int64_t)coeff[i + j]; + } + } + } + assert(error >= 0 && sqcoeff >= 0); + error = (error + rounding) >> shift; + sqcoeff = (sqcoeff + rounding) >> shift; + + *ssz = sqcoeff; + return error; +} diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_avx2.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_avx2.c new file 
// Zero fill 16 positions in the output buffer (the callers in this file
// advance by 16 coefficients per call). In the high bitdepth build one 256-bit
// store covers 8 entries, hence the second store at a + 8; the other build
// uses a single 256-bit store, which presumably covers all 16 entries because
// tran_low_t is 16 bits wide there -- TODO confirm.
static VPX_FORCE_INLINE void store_zero_tran_low(tran_low_t *a) {
  const __m256i zero = _mm256_setzero_si256();
#if CONFIG_VP9_HIGHBITDEPTH
  _mm256_storeu_si256((__m256i *)(a), zero);
  _mm256_storeu_si256((__m256i *)(a + 8), zero);
#else
  _mm256_storeu_si256((__m256i *)(a), zero);
#endif
}

// Loads the round, quant and dequant tables (eight 16-bit values each, read
// with an aligned 128-bit load) and widens each to 256 bits.
//
// The 0x54 permute picks source quadwords 0, 1, 1, 1: the lower 128 bits keep
// the eight loaded values unchanged and the upper 128 bits hold the second
// quadword twice. Only quadwords 0 and 1 are read, so the undefined upper half
// left by the cast never reaches the result.
static VPX_FORCE_INLINE void load_fp_values_avx2(
    const int16_t *round_ptr, __m256i *round, const int16_t *quant_ptr,
    __m256i *quant, const int16_t *dequant_ptr, __m256i *dequant) {
  *round = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)round_ptr));
  *round = _mm256_permute4x64_epi64(*round, 0x54);
  *quant = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)quant_ptr));
  *quant = _mm256_permute4x64_epi64(*quant, 0x54);
  *dequant =
      _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr));
  *dequant = _mm256_permute4x64_epi64(*dequant, 0x54);
}
__m256i v_iscan = _mm256_loadu_si256((const __m256i *)iscan); +#endif + const __m256i v_nz_iscan = _mm256_and_si256(v_iscan, v_mask); + return _mm256_max_epi16(v_eobmax, v_nz_iscan); +} + +static VPX_FORCE_INLINE uint16_t get_max_eob(__m256i eob256) { + const __m256i eob_lo = eob256; + // Copy upper 128 to lower 128 + const __m256i eob_hi = _mm256_permute2x128_si256(eob256, eob256, 0X81); + __m256i eob = _mm256_max_epi16(eob_lo, eob_hi); + __m256i eob_s = _mm256_shuffle_epi32(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 1); + eob = _mm256_max_epi16(eob, eob_s); +#if defined(_MSC_VER) && (_MSC_VER < 1910) + return _mm_cvtsi128_si32(_mm256_extracti128_si256(eob, 0)) & 0xffff; +#else + return (uint16_t)_mm256_extract_epi16(eob, 0); +#endif +} + +static VPX_FORCE_INLINE void quantize_fp_16( + const __m256i *round, const __m256i *quant, const __m256i *dequant, + const __m256i *thr, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob_max) { + const __m256i coeff = load_tran_low(coeff_ptr); + const __m256i abs_coeff = _mm256_abs_epi16(coeff); + const int32_t nzflag = + _mm256_movemask_epi8(_mm256_cmpgt_epi16(abs_coeff, *thr)); + + if (nzflag) { + const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, *round); + const __m256i abs_qcoeff = _mm256_mulhi_epi16(tmp_rnd, *quant); + const __m256i qcoeff = _mm256_sign_epi16(abs_qcoeff, coeff); + const __m256i dqcoeff = _mm256_mullo_epi16(qcoeff, *dequant); + const __m256i nz_mask = + _mm256_cmpgt_epi16(abs_qcoeff, _mm256_setzero_si256()); + store_tran_low(qcoeff, qcoeff_ptr); + store_tran_low(dqcoeff, dqcoeff_ptr); + + *eob_max = get_max_lane_eob(iscan_ptr, *eob_max, nz_mask); + } else { + store_zero_tran_low(qcoeff_ptr); + store_zero_tran_low(dqcoeff_ptr); + } +} + +void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + 
const int16_t *round_ptr, const int16_t *quant_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + __m256i round, quant, dequant, thr; + __m256i eob_max = _mm256_setzero_si256(); + (void)scan; + + coeff_ptr += n_coeffs; + iscan += n_coeffs; + qcoeff_ptr += n_coeffs; + dqcoeff_ptr += n_coeffs; + n_coeffs = -n_coeffs; + + // Setup global values + load_fp_values_avx2(round_ptr, &round, quant_ptr, &quant, dequant_ptr, + &dequant); + thr = _mm256_setzero_si256(); + + quantize_fp_16(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, + iscan + n_coeffs, qcoeff_ptr + n_coeffs, + dqcoeff_ptr + n_coeffs, &eob_max); + + n_coeffs += 8 * 2; + + // remove dc constants + dequant = _mm256_permute2x128_si256(dequant, dequant, 0x31); + quant = _mm256_permute2x128_si256(quant, quant, 0x31); + round = _mm256_permute2x128_si256(round, round, 0x31); + thr = _mm256_srai_epi16(dequant, 1); + + // AC only loop + while (n_coeffs < 0) { + quantize_fp_16(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, + iscan + n_coeffs, qcoeff_ptr + n_coeffs, + dqcoeff_ptr + n_coeffs, &eob_max); + n_coeffs += 8 * 2; + } + + *eob_ptr = get_max_eob(eob_max); +} + +// Enable this flag when matching the optimized code to +// vp9_quantize_fp_32x32_c(). Disabled, the optimized code will match the +// existing ssse3 code and quantize_fp_32x32_nz_c(). 
+// +// #define MATCH_VP9_QUANTIZE_FP_32X32_C + +#ifndef MATCH_VP9_QUANTIZE_FP_32X32_C +static VPX_FORCE_INLINE void quantize_fp_32x32_16_no_nzflag( + const __m256i *round, const __m256i *quant, const __m256i *dequant, + const __m256i *thr, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob_max) { + const __m256i coeff = load_tran_low(coeff_ptr); + const __m256i abs_coeff = _mm256_abs_epi16(coeff); + const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, *round); + const __m256i abs_qcoeff = _mm256_mulhi_epi16(tmp_rnd, *quant); + const __m256i qcoeff = _mm256_sign_epi16(abs_qcoeff, coeff); + const __m256i abs_dqcoeff = + _mm256_srli_epi16(_mm256_mullo_epi16(abs_qcoeff, *dequant), 1); + const __m256i dqcoeff = _mm256_sign_epi16(abs_dqcoeff, coeff); + const __m256i nz_mask = + _mm256_cmpgt_epi16(abs_qcoeff, _mm256_setzero_si256()); + store_tran_low(qcoeff, qcoeff_ptr); + store_tran_low(dqcoeff, dqcoeff_ptr); + + *eob_max = get_max_lane_eob(iscan_ptr, *eob_max, nz_mask); + (void)thr; +} +#endif + +static VPX_FORCE_INLINE void quantize_fp_32x32_16( + const __m256i *round, const __m256i *quant, const __m256i *dequant, + const __m256i *thr, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob_max) { + const __m256i coeff = load_tran_low(coeff_ptr); + const __m256i abs_coeff = _mm256_abs_epi16(coeff); + const __m256i thr_mask = _mm256_cmpgt_epi16(abs_coeff, *thr); + const int32_t nzflag = _mm256_movemask_epi8(thr_mask); + + if (nzflag) { +#ifdef MATCH_VP9_QUANTIZE_FP_32X32_C + const __m256i tmp_rnd = + _mm256_and_si256(_mm256_adds_epi16(abs_coeff, *round), thr_mask); +#else + const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, *round); +#endif + const __m256i abs_qcoeff = _mm256_mulhi_epi16(tmp_rnd, *quant); + const __m256i qcoeff = _mm256_sign_epi16(abs_qcoeff, coeff); + const __m256i abs_dqcoeff = + 
_mm256_srli_epi16(_mm256_mullo_epi16(abs_qcoeff, *dequant), 1); + const __m256i dqcoeff = _mm256_sign_epi16(abs_dqcoeff, coeff); + const __m256i nz_mask = + _mm256_cmpgt_epi16(abs_qcoeff, _mm256_setzero_si256()); + store_tran_low(qcoeff, qcoeff_ptr); + store_tran_low(dqcoeff, dqcoeff_ptr); + + *eob_max = get_max_lane_eob(iscan_ptr, *eob_max, nz_mask); + } else { + store_zero_tran_low(qcoeff_ptr); + store_zero_tran_low(dqcoeff_ptr); + } +} + +void vp9_quantize_fp_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *round_ptr, + const int16_t *quant_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + __m256i round, quant, dequant, thr; + __m256i eob_max = _mm256_setzero_si256(); + (void)scan; + + coeff_ptr += n_coeffs; + iscan += n_coeffs; + qcoeff_ptr += n_coeffs; + dqcoeff_ptr += n_coeffs; + n_coeffs = -n_coeffs; + + // Setup global values + load_fp_values_avx2(round_ptr, &round, quant_ptr, &quant, dequant_ptr, + &dequant); + thr = _mm256_srli_epi16(dequant, 2); + quant = _mm256_slli_epi16(quant, 1); + { + const __m256i rnd = _mm256_set1_epi16((int16_t)1); + round = _mm256_add_epi16(round, rnd); + round = _mm256_srai_epi16(round, 1); + } + +#ifdef MATCH_VP9_QUANTIZE_FP_32X32_C + // Subtracting 1 here eliminates a _mm256_cmpeq_epi16() instruction when + // calculating the zbin mask. 
// AVX2 "fp" quantizer for 32x32 blocks. Processes n_coeffs coefficients in
// groups of 16 and stores the end-of-block position in *eob_ptr. scan is
// unused.
void vp9_quantize_fp_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                                const int16_t *round_ptr,
                                const int16_t *quant_ptr,
                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
                                const int16_t *scan, const int16_t *iscan) {
  __m256i round, quant, dequant, thr;
  __m256i eob_max = _mm256_setzero_si256();
  (void)scan;

  // Point every buffer one past its end and walk a negative offset up to
  // zero, so a single counter doubles as index and loop condition.
  coeff_ptr += n_coeffs;
  iscan += n_coeffs;
  qcoeff_ptr += n_coeffs;
  dqcoeff_ptr += n_coeffs;
  n_coeffs = -n_coeffs;

  // Setup global values
  load_fp_values_avx2(round_ptr, &round, quant_ptr, &quant, dequant_ptr,
                      &dequant);
  // 32x32 adjustments: threshold is dequant / 4, quant is doubled (so the
  // high-half multiply acts as a shift by 15 instead of 16) and round is
  // halved, rounding up.
  thr = _mm256_srli_epi16(dequant, 2);
  quant = _mm256_slli_epi16(quant, 1);
  {
    const __m256i rnd = _mm256_set1_epi16((int16_t)1);
    round = _mm256_add_epi16(round, rnd);
    round = _mm256_srai_epi16(round, 1);
  }

#ifdef MATCH_VP9_QUANTIZE_FP_32X32_C
  // Subtracting 1 here eliminates a _mm256_cmpeq_epi16() instruction when
  // calculating the zbin mask.
  thr = _mm256_sub_epi16(thr, _mm256_set1_epi16(1));
  quantize_fp_32x32_16(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs,
                       iscan + n_coeffs, qcoeff_ptr + n_coeffs,
                       dqcoeff_ptr + n_coeffs, &eob_max);
#else
  // Default build: the first group (DC + 15 AC) is quantized unconditionally.
  quantize_fp_32x32_16_no_nzflag(
      &round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, iscan + n_coeffs,
      qcoeff_ptr + n_coeffs, dqcoeff_ptr + n_coeffs, &eob_max);
#endif

  n_coeffs += 8 * 2;

  // remove dc constants
  dequant = _mm256_permute2x128_si256(dequant, dequant, 0x31);
  quant = _mm256_permute2x128_si256(quant, quant, 0x31);
  round = _mm256_permute2x128_si256(round, round, 0x31);
  thr = _mm256_permute2x128_si256(thr, thr, 0x31);

  // AC only loop
  while (n_coeffs < 0) {
    quantize_fp_32x32_16(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs,
                         iscan + n_coeffs, qcoeff_ptr + n_coeffs,
                         dqcoeff_ptr + n_coeffs, &eob_max);
    n_coeffs += 8 * 2;
  }

  *eob_ptr = get_max_eob(eob_max);
}

#if CONFIG_VP9_HIGHBITDEPTH
// For each of the eight 32-bit lanes, returns the low 32 bits of
// (x * y) >> (16 - log_scale), computed through 64-bit products.
// _mm256_mul_epi32 only multiplies the even 32-bit lanes, so the odd lanes
// are shifted down into even position and multiplied in a second pass. The
// right shifts are logical; the callers in this file pass non-negative
// operands (a magnitude plus rounding, and zero-extended quant values).
static VPX_FORCE_INLINE __m256i mm256_mul_shift_epi32_logscale(const __m256i *x,
                                                               const __m256i *y,
                                                               int log_scale) {
  __m256i prod_lo = _mm256_mul_epi32(*x, *y);
  __m256i prod_hi = _mm256_srli_epi64(*x, 32);
  const __m256i mult_hi = _mm256_srli_epi64(*y, 32);
  // Keeps the low 32 bits of every 64-bit element.
  const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1);
  prod_hi = _mm256_mul_epi32(prod_hi, mult_hi);
  prod_lo = _mm256_srli_epi64(prod_lo, 16 - log_scale);
  prod_lo = _mm256_and_si256(prod_lo, mask);
  prod_hi = _mm256_srli_epi64(prod_hi, 16 - log_scale);
  // Move the odd-lane results back to the upper 32 bits of each element.
  prod_hi = _mm256_slli_epi64(prod_hi, 32);
  return _mm256_or_si256(prod_lo, prod_hi);
}

// Loads eight 16-bit values (aligned 128-bit load) and zero-extends them to
// eight 32-bit lanes: values 0..3 land in the lower 128 bits (dc) and values
// 4..7 in the upper 128 bits (ac).
static VPX_FORCE_INLINE __m256i highbd_init_256(const int16_t *val_ptr) {
  const __m128i v = _mm_load_si128((const __m128i *)val_ptr);
  const __m128i zero = _mm_setzero_si128();
  const __m128i dc = _mm_unpacklo_epi16(v, zero);
  const __m128i ac = _mm_unpackhi_epi16(v, zero);
  return _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1);
}
highbd_load_fp_values( + const int16_t *round_ptr, __m256i *round, const int16_t *quant_ptr, + __m256i *quant, const int16_t *dequant_ptr, __m256i *dequant) { + *round = highbd_init_256(round_ptr); + *quant = highbd_init_256(quant_ptr); + *dequant = highbd_init_256(dequant_ptr); +} + +static VPX_FORCE_INLINE __m256i highbd_get_max_lane_eob( + const int16_t *iscan_ptr, __m256i eobmax, __m256i nz_mask) { + const __m256i packed_nz_mask = + _mm256_packs_epi32(nz_mask, _mm256_setzero_si256()); + const __m256i packed_nz_mask_perm = + _mm256_permute4x64_epi64(packed_nz_mask, 0xD8); + const __m256i iscan = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)iscan_ptr)); + const __m256i nz_iscan = _mm256_and_si256(iscan, packed_nz_mask_perm); + return _mm256_max_epi16(eobmax, nz_iscan); +} + +static VPX_FORCE_INLINE void highbd_quantize_fp( + const __m256i *round, const __m256i *quant, const __m256i *dequant, + const tran_low_t *coeff_ptr, const int16_t *iscan_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob) { + const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); + const __m256i abs_coeff = _mm256_abs_epi32(coeff); + const __m256i tmp_rnd = _mm256_add_epi32(abs_coeff, *round); + const __m256i abs_q = mm256_mul_shift_epi32_logscale(&tmp_rnd, quant, 0); + const __m256i abs_dq = _mm256_mullo_epi32(abs_q, *dequant); + const __m256i q = _mm256_sign_epi32(abs_q, coeff); + const __m256i dq = _mm256_sign_epi32(abs_dq, coeff); + const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256()); + + _mm256_storeu_si256((__m256i *)qcoeff_ptr, q); + _mm256_storeu_si256((__m256i *)dqcoeff_ptr, dq); + + *eob = highbd_get_max_lane_eob(iscan_ptr, *eob, nz_mask); +} + +void vp9_highbd_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *round_ptr, + const int16_t *quant_ptr, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const 
int16_t *iscan) { + const int step = 8; + __m256i round, quant, dequant; + __m256i eob_max = _mm256_setzero_si256(); + (void)scan; + + coeff_ptr += n_coeffs; + iscan += n_coeffs; + qcoeff_ptr += n_coeffs; + dqcoeff_ptr += n_coeffs; + n_coeffs = -n_coeffs; + + // Setup global values + highbd_load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, + &dequant); + + highbd_quantize_fp(&round, &quant, &dequant, coeff_ptr + n_coeffs, + iscan + n_coeffs, qcoeff_ptr + n_coeffs, + dqcoeff_ptr + n_coeffs, &eob_max); + + n_coeffs += step; + + // remove dc constants + dequant = _mm256_permute2x128_si256(dequant, dequant, 0x31); + quant = _mm256_permute2x128_si256(quant, quant, 0x31); + round = _mm256_permute2x128_si256(round, round, 0x31); + + // AC only loop + while (n_coeffs < 0) { + highbd_quantize_fp(&round, &quant, &dequant, coeff_ptr + n_coeffs, + iscan + n_coeffs, qcoeff_ptr + n_coeffs, + dqcoeff_ptr + n_coeffs, &eob_max); + n_coeffs += step; + } + + *eob_ptr = get_max_eob(eob_max); +} + +static VPX_FORCE_INLINE void highbd_quantize_fp_32x32( + const __m256i *round, const __m256i *quant, const __m256i *dequant, + const __m256i *thr, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob) { + const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); + const __m256i abs_coeff = _mm256_abs_epi32(coeff); + const __m256i thr_mask = _mm256_cmpgt_epi32(abs_coeff, *thr); + const __m256i tmp_rnd = + _mm256_and_si256(_mm256_add_epi32(abs_coeff, *round), thr_mask); + const __m256i abs_q = mm256_mul_shift_epi32_logscale(&tmp_rnd, quant, 0); + const __m256i abs_dq = + _mm256_srli_epi32(_mm256_mullo_epi32(abs_q, *dequant), 1); + const __m256i q = _mm256_sign_epi32(abs_q, coeff); + const __m256i dq = _mm256_sign_epi32(abs_dq, coeff); + const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256()); + + _mm256_storeu_si256((__m256i *)qcoeff_ptr, q); + _mm256_storeu_si256((__m256i 
// 32x32 high bitdepth step for eight 32-bit coefficients. Lanes whose
// magnitude does not exceed *thr are zeroed before the multiply, so they
// quantize to zero; the dequantized magnitude is halved before the sign is
// restored. Updates *eob with the scan positions of the non-zero results.
static VPX_FORCE_INLINE void highbd_quantize_fp_32x32(
    const __m256i *round, const __m256i *quant, const __m256i *dequant,
    const __m256i *thr, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr,
    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob) {
  const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
  const __m256i abs_coeff = _mm256_abs_epi32(coeff);
  const __m256i thr_mask = _mm256_cmpgt_epi32(abs_coeff, *thr);
  const __m256i tmp_rnd =
      _mm256_and_si256(_mm256_add_epi32(abs_coeff, *round), thr_mask);
  const __m256i abs_q = mm256_mul_shift_epi32_logscale(&tmp_rnd, quant, 0);
  const __m256i abs_dq =
      _mm256_srli_epi32(_mm256_mullo_epi32(abs_q, *dequant), 1);
  const __m256i q = _mm256_sign_epi32(abs_q, coeff);
  const __m256i dq = _mm256_sign_epi32(abs_dq, coeff);
  const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256());

  _mm256_storeu_si256((__m256i *)qcoeff_ptr, q);
  _mm256_storeu_si256((__m256i *)dqcoeff_ptr, dq);

  *eob = highbd_get_max_lane_eob(iscan_ptr, *eob, nz_mask);
}

// High bitdepth AVX2 "fp" quantizer for 32x32 blocks. Processes n_coeffs
// coefficients eight at a time and stores the end-of-block position in
// *eob_ptr. scan is unused.
void vp9_highbd_quantize_fp_32x32_avx2(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr,
    const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
    const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan,
    const int16_t *iscan) {
  const int step = 8;
  __m256i round, quant, dequant, thr;
  __m256i eob_max = _mm256_setzero_si256();
  (void)scan;

  // Point every buffer one past its end and walk a negative offset up to
  // zero.
  coeff_ptr += n_coeffs;
  iscan += n_coeffs;
  qcoeff_ptr += n_coeffs;
  dqcoeff_ptr += n_coeffs;
  n_coeffs = -n_coeffs;

  // Setup global values
  highbd_load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr,
                        &dequant);
  thr = _mm256_srli_epi32(dequant, 2);
  // Subtracting 1 here eliminates a _mm256_cmpeq_epi32() instruction when
  // calculating the zbin mask.
  thr = _mm256_sub_epi32(thr, _mm256_set1_epi32(1));
  // quant is doubled so the 16-bit shift in the multiply helper acts as a
  // shift by 15; round is halved, rounding up.
  quant = _mm256_slli_epi32(quant, 1);
  round = _mm256_srai_epi32(_mm256_add_epi32(round, _mm256_set1_epi32(1)), 1);

  // First group: lower four lanes use the first four table entries (including
  // DC), upper four lanes the last four.
  highbd_quantize_fp_32x32(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs,
                           iscan + n_coeffs, qcoeff_ptr + n_coeffs,
                           dqcoeff_ptr + n_coeffs, &eob_max);

  n_coeffs += step;

  // remove dc constants
  dequant = _mm256_permute2x128_si256(dequant, dequant, 0x31);
  quant = _mm256_permute2x128_si256(quant, quant, 0x31);
  round = _mm256_permute2x128_si256(round, round, 0x31);
  thr = _mm256_permute2x128_si256(thr, thr, 0x31);

  // AC only loop
  while (n_coeffs < 0) {
    highbd_quantize_fp_32x32(
        &round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, iscan + n_coeffs,
        qcoeff_ptr + n_coeffs, dqcoeff_ptr + n_coeffs, &eob_max);
    n_coeffs += step;
  }

  *eob_ptr = get_max_eob(eob_max);
}
#endif  // CONFIG_VP9_HIGHBITDEPTH
// SSE2 "fp" quantizer. Processes n_coeffs coefficients in groups of 16 (two
// vectors of 8), writing the quantized values to qcoeff_ptr and the
// dequantized values to dqcoeff_ptr, and stores the end-of-block position
// derived from iscan in *eob_ptr. The first 16 coefficients are always
// processed; scan is unused.
void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                          const int16_t *round_ptr, const int16_t *quant_ptr,
                          tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                          const int16_t *dequant_ptr, uint16_t *eob_ptr,
                          const int16_t *scan, const int16_t *iscan) {
  const __m128i zero = _mm_setzero_si128();
  __m128i thr;
  int nzflag;
  int index = 16;
  __m128i round, quant, dequant;
  __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
  __m128i qcoeff0, qcoeff1;
  __m128i eob;

  (void)scan;

  // Setup global values.
  load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, &dequant);

  // Do DC and first 15 AC.
  coeff0 = load_tran_low(coeff_ptr);
  coeff1 = load_tran_low(coeff_ptr + 8);

  // Poor man's abs().
  coeff0_sign = _mm_srai_epi16(coeff0, 15);
  coeff1_sign = _mm_srai_epi16(coeff1, 15);
  qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
  qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);

  // The first vector still uses the constants as loaded (DC included).
  qcoeff0 = _mm_adds_epi16(qcoeff0, round);
  qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant);

  // From here on only the upper 64 bits of round and quant are used,
  // duplicated into both halves.
  round = _mm_unpackhi_epi64(round, round);
  quant = _mm_unpackhi_epi64(quant, quant);

  qcoeff1 = _mm_adds_epi16(qcoeff1, round);
  qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant);

  // Reinsert signs.
  qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
  qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);

  store_tran_low(qcoeff0, qcoeff_ptr);
  store_tran_low(qcoeff1, qcoeff_ptr + 8);

  // qcoeff0/qcoeff1 are reused to hold the dequantized values from here on.
  qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant);
  dequant = _mm_unpackhi_epi64(dequant, dequant);
  qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant);

  store_tran_low(qcoeff0, dqcoeff_ptr);
  store_tran_low(qcoeff1, dqcoeff_ptr + 8);

  // Note that the end-of-block scan is fed the dequantized values.
  eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);

  // Groups in which no magnitude exceeds dequant / 2 are zeroed without
  // running the multiply.
  thr = _mm_srai_epi16(dequant, 1);

  // AC only loop.
  while (index < n_coeffs) {
    coeff0 = load_tran_low(coeff_ptr + index);
    coeff1 = load_tran_low(coeff_ptr + index + 8);

    // Poor man's abs().
    coeff0_sign = _mm_srai_epi16(coeff0, 15);
    coeff1_sign = _mm_srai_epi16(coeff1, 15);
    qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
    qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);

    nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
             _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));

    if (nzflag) {
      __m128i eob0;
      qcoeff0 = _mm_adds_epi16(qcoeff0, round);
      qcoeff1 = _mm_adds_epi16(qcoeff1, round);
      qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant);
      qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant);

      // Reinsert signs.
      qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
      qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);

      store_tran_low(qcoeff0, qcoeff_ptr + index);
      store_tran_low(qcoeff1, qcoeff_ptr + index + 8);

      qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant);
      qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant);

      store_tran_low(qcoeff0, dqcoeff_ptr + index);
      store_tran_low(qcoeff1, dqcoeff_ptr + index + 8);

      eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero);
      eob = _mm_max_epi16(eob, eob0);
    } else {
      store_zero_tran_low(qcoeff_ptr + index);
      store_zero_tran_low(qcoeff_ptr + index + 8);

      store_zero_tran_low(dqcoeff_ptr + index);
      store_zero_tran_low(dqcoeff_ptr + index + 8);
    }

    index += 16;
  }

  *eob_ptr = accumulate_eob(eob);
}
+ */ + +#include <assert.h> +#include <tmmintrin.h> + +#include "./vp9_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/x86/bitdepth_conversion_sse2.h" +#include "vpx_dsp/x86/quantize_sse2.h" +#include "vpx_dsp/x86/quantize_ssse3.h" + +void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *round_ptr, const int16_t *quant_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + const __m128i zero = _mm_setzero_si128(); + __m128i thr; + int nzflag; + int index = 16; + __m128i round, quant, dequant; + __m128i coeff0, coeff1; + __m128i qcoeff0, qcoeff1; + __m128i eob; + + (void)scan; + + // Setup global values. + load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, &dequant); + + // Do DC and first 15 AC. + coeff0 = load_tran_low(coeff_ptr); + coeff1 = load_tran_low(coeff_ptr + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + qcoeff0 = _mm_adds_epi16(qcoeff0, round); + qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + + qcoeff1 = _mm_adds_epi16(qcoeff1, round); + qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant); + + // Reinsert signs. + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + store_tran_low(qcoeff0, qcoeff_ptr); + store_tran_low(qcoeff1, qcoeff_ptr + 8); + + qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant); + + store_tran_low(qcoeff0, dqcoeff_ptr); + store_tran_low(qcoeff1, dqcoeff_ptr + 8); + + eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero); + + thr = _mm_srai_epi16(dequant, 1); + + // AC only loop. 
+ while (index < n_coeffs) { + coeff0 = load_tran_low(coeff_ptr + index); + coeff1 = load_tran_low(coeff_ptr + index + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) | + _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr)); + + if (nzflag) { + __m128i eob0; + qcoeff0 = _mm_adds_epi16(qcoeff0, round); + qcoeff1 = _mm_adds_epi16(qcoeff1, round); + qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant); + qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant); + + // Reinsert signs. + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + store_tran_low(qcoeff0, qcoeff_ptr + index); + store_tran_low(qcoeff1, qcoeff_ptr + index + 8); + + qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant); + qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant); + + store_tran_low(qcoeff0, dqcoeff_ptr + index); + store_tran_low(qcoeff1, dqcoeff_ptr + index + 8); + + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero); + eob = _mm_max_epi16(eob, eob0); + } else { + store_zero_tran_low(qcoeff_ptr + index); + store_zero_tran_low(qcoeff_ptr + index + 8); + + store_zero_tran_low(dqcoeff_ptr + index); + store_zero_tran_low(dqcoeff_ptr + index + 8); + } + + index += 16; + } + + *eob_ptr = accumulate_eob(eob); +} + +void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *round_ptr, + const int16_t *quant_ptr, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one_s16 = _mm_set1_epi16(1); + __m128i thr; + int nzflag; + int index = 16; + __m128i round, quant, dequant; + __m128i coeff0, coeff1; + __m128i qcoeff0, qcoeff1; + __m128i eob; + + (void)scan; + + // Setup global values. + load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, &dequant); + // The 32x32 halves round. 
+ round = _mm_add_epi16(round, one_s16); + round = _mm_srli_epi16(round, 1); + + // The 16x16 shifts by 16, the 32x32 shifts by 15. We want to use pmulhw so + // upshift quant to account for this. + quant = _mm_slli_epi16(quant, 1); + + // Do DC and first 15 AC. + coeff0 = load_tran_low(coeff_ptr); + coeff1 = load_tran_low(coeff_ptr + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + qcoeff0 = _mm_adds_epi16(qcoeff0, round); + qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + + qcoeff1 = _mm_adds_epi16(qcoeff1, round); + qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant); + + // Reinsert signs. + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + store_tran_low(qcoeff0, qcoeff_ptr); + store_tran_low(qcoeff1, qcoeff_ptr + 8); + + // Get the abs value of qcoeff again so we can use shifts for division. + qcoeff0 = _mm_abs_epi16(qcoeff0); + qcoeff1 = _mm_abs_epi16(qcoeff1); + + qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant); + + // Divide by 2. + qcoeff0 = _mm_srli_epi16(qcoeff0, 1); + qcoeff1 = _mm_srli_epi16(qcoeff1, 1); + + // Reinsert signs. + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + store_tran_low(qcoeff0, dqcoeff_ptr); + store_tran_low(qcoeff1, dqcoeff_ptr + 8); + + eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero); + + thr = _mm_srai_epi16(dequant, 2); + + // AC only loop. 
+ while (index < n_coeffs) { + coeff0 = load_tran_low(coeff_ptr + index); + coeff1 = load_tran_low(coeff_ptr + index + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) | + _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr)); + + if (nzflag) { + qcoeff0 = _mm_adds_epi16(qcoeff0, round); + qcoeff1 = _mm_adds_epi16(qcoeff1, round); + qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant); + qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant); + + // Reinsert signs. + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + store_tran_low(qcoeff0, qcoeff_ptr + index); + store_tran_low(qcoeff1, qcoeff_ptr + index + 8); + + // Get the abs value of qcoeff again so we can use shifts for division. + qcoeff0 = _mm_abs_epi16(qcoeff0); + qcoeff1 = _mm_abs_epi16(qcoeff1); + + qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant); + qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant); + + // Divide by 2. + qcoeff0 = _mm_srli_epi16(qcoeff0, 1); + qcoeff1 = _mm_srli_epi16(qcoeff1, 1); + + // Reinsert signs. + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + store_tran_low(qcoeff0, dqcoeff_ptr + index); + store_tran_low(qcoeff1, dqcoeff_ptr + index + 8); + } else { + store_zero_tran_low(qcoeff_ptr + index); + store_zero_tran_low(qcoeff_ptr + index + 8); + + store_zero_tran_low(dqcoeff_ptr + index); + store_zero_tran_low(dqcoeff_ptr + index + 8); + } + + if (nzflag) { + const __m128i eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero); + eob = _mm_max_epi16(eob, eob0); + } + index += 16; + } + + *eob_ptr = accumulate_eob(eob); +} |