diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-15 03:35:49 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-15 03:35:49 +0000 |
commit | d8bbc7858622b6d9c278469aab701ca0b609cddf (patch) | |
tree | eff41dc61d9f714852212739e6b3738b82a2af87 /third_party/aom/aom_dsp/flow_estimation/x86/corner_match_avx2.c | |
parent | Releasing progress-linux version 125.0.3-1~progress7.99u1. (diff) | |
download | firefox-d8bbc7858622b6d9c278469aab701ca0b609cddf.tar.xz firefox-d8bbc7858622b6d9c278469aab701ca0b609cddf.zip |
Merging upstream version 126.0.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/aom/aom_dsp/flow_estimation/x86/corner_match_avx2.c')
-rw-r--r-- | third_party/aom/aom_dsp/flow_estimation/x86/corner_match_avx2.c | 148 |
1 files changed, 98 insertions, 50 deletions
diff --git a/third_party/aom/aom_dsp/flow_estimation/x86/corner_match_avx2.c b/third_party/aom/aom_dsp/flow_estimation/x86/corner_match_avx2.c index 87c76fa13b..ff69ae75f5 100644 --- a/third_party/aom/aom_dsp/flow_estimation/x86/corner_match_avx2.c +++ b/third_party/aom/aom_dsp/flow_estimation/x86/corner_match_avx2.c @@ -17,64 +17,112 @@ #include "aom_ports/mem.h" #include "aom_dsp/flow_estimation/corner_match.h" -DECLARE_ALIGNED(16, static const uint8_t, - byte_mask[16]) = { 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 0, 0, 0 }; -#if MATCH_SZ != 13 -#error "Need to change byte_mask in corner_match_sse4.c if MATCH_SZ != 13" +DECLARE_ALIGNED(32, static const uint16_t, ones_array[16]) = { 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1 }; + +#if MATCH_SZ != 16 +#error "Need to apply pixel mask in corner_match_avx2.c if MATCH_SZ != 16" #endif -/* Compute corr(frame1, frame2) * MATCH_SZ * stddev(frame1), where the -correlation/standard deviation are taken over MATCH_SZ by MATCH_SZ windows -of each image, centered at (x1, y1) and (x2, y2) respectively. +/* Compute mean and standard deviation of pixels in a window of size + MATCH_SZ by MATCH_SZ centered at (x, y). + Store results into *mean and *one_over_stddev + + Note: The output of this function is scaled by MATCH_SZ, as in + *mean = MATCH_SZ * <true mean> and + *one_over_stddev = 1 / (MATCH_SZ * <true stddev>) + + Combined with the fact that we return 1/stddev rather than the standard + deviation itself, this allows us to completely avoid divisions in + aom_compute_correlation, which is much hotter than this function is. + + Returns true if this feature point is usable, false otherwise. */ -double av1_compute_cross_correlation_avx2(const unsigned char *frame1, - int stride1, int x1, int y1, - const unsigned char *frame2, - int stride2, int x2, int y2) { - int i, stride1_i = 0, stride2_i = 0; - __m256i temp1, sum_vec, sumsq2_vec, cross_vec, v, v1_1, v2_1; - const __m128i mask = _mm_load_si128((__m128i *)byte_mask); - const __m256i zero = _mm256_setzero_si256(); - __m128i v1, v2; - - sum_vec = zero; - sumsq2_vec = zero; - cross_vec = zero; +bool aom_compute_mean_stddev_avx2(const unsigned char *frame, int stride, int x, + int y, double *mean, + double *one_over_stddev) { + __m256i sum_vec = _mm256_setzero_si256(); + __m256i sumsq_vec = _mm256_setzero_si256(); + + frame += (y - MATCH_SZ_BY2) * stride + (x - MATCH_SZ_BY2); + + for (int i = 0; i < MATCH_SZ; ++i) { + const __m256i v = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *)frame)); + + sum_vec = _mm256_add_epi16(sum_vec, v); + sumsq_vec = _mm256_add_epi32(sumsq_vec, _mm256_madd_epi16(v, v)); + + frame += stride; + } + + // Reduce sum_vec and sumsq_vec into single values + // Start by reducing each vector to 8x32-bit values, hadd() to perform 8 + // additions, sum vertically to do 4 more, then the last 2 in scalar code. + const __m256i ones = _mm256_load_si256((__m256i *)ones_array); + const __m256i partial_sum = _mm256_madd_epi16(sum_vec, ones); + const __m256i tmp_8x32 = _mm256_hadd_epi32(partial_sum, sumsq_vec); + const __m128i tmp_4x32 = _mm_add_epi32(_mm256_extracti128_si256(tmp_8x32, 0), + _mm256_extracti128_si256(tmp_8x32, 1)); + const int sum = + _mm_extract_epi32(tmp_4x32, 0) + _mm_extract_epi32(tmp_4x32, 1); + const int sumsq = + _mm_extract_epi32(tmp_4x32, 2) + _mm_extract_epi32(tmp_4x32, 3); + + *mean = (double)sum / MATCH_SZ; + const double variance = sumsq - (*mean) * (*mean); + if (variance < MIN_FEATURE_VARIANCE) { + *one_over_stddev = 0.0; + return false; + } + *one_over_stddev = 1.0 / sqrt(variance); + return true; +} + +/* Compute corr(frame1, frame2) over a window of size MATCH_SZ by MATCH_SZ. + To save on computation, the mean and (1 divided by the) standard deviation + of the window in each frame are precomputed and passed into this function + as arguments. +*/ +double aom_compute_correlation_avx2(const unsigned char *frame1, int stride1, + int x1, int y1, double mean1, + double one_over_stddev1, + const unsigned char *frame2, int stride2, + int x2, int y2, double mean2, + double one_over_stddev2) { + __m256i cross_vec = _mm256_setzero_si256(); frame1 += (y1 - MATCH_SZ_BY2) * stride1 + (x1 - MATCH_SZ_BY2); frame2 += (y2 - MATCH_SZ_BY2) * stride2 + (x2 - MATCH_SZ_BY2); - for (i = 0; i < MATCH_SZ; ++i) { - v1 = _mm_and_si128(_mm_loadu_si128((__m128i *)&frame1[stride1_i]), mask); - v1_1 = _mm256_cvtepu8_epi16(v1); - v2 = _mm_and_si128(_mm_loadu_si128((__m128i *)&frame2[stride2_i]), mask); - v2_1 = _mm256_cvtepu8_epi16(v2); + for (int i = 0; i < MATCH_SZ; ++i) { + const __m256i v1 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *)frame1)); + const __m256i v2 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *)frame2)); - v = _mm256_insertf128_si256(_mm256_castsi128_si256(v1), v2, 1); - sumsq2_vec = _mm256_add_epi32(sumsq2_vec, _mm256_madd_epi16(v2_1, v2_1)); + cross_vec = _mm256_add_epi32(cross_vec, _mm256_madd_epi16(v1, v2)); - sum_vec = _mm256_add_epi16(sum_vec, _mm256_sad_epu8(v, zero)); - cross_vec = _mm256_add_epi32(cross_vec, _mm256_madd_epi16(v1_1, v2_1)); - stride1_i += stride1; - stride2_i += stride2; + frame1 += stride1; + frame2 += stride2; } - __m256i sum_vec1 = _mm256_srli_si256(sum_vec, 8); - sum_vec = _mm256_add_epi32(sum_vec, sum_vec1); - int sum1_acc = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_vec)); - int sum2_acc = _mm256_extract_epi32(sum_vec, 4); - - __m256i unp_low = _mm256_unpacklo_epi64(sumsq2_vec, cross_vec); - __m256i unp_hig = _mm256_unpackhi_epi64(sumsq2_vec, cross_vec); - temp1 = _mm256_add_epi32(unp_low, unp_hig); - - __m128i low_sumsq = _mm256_castsi256_si128(temp1); - low_sumsq = _mm_add_epi32(low_sumsq, _mm256_extractf128_si256(temp1, 1)); - low_sumsq = _mm_add_epi32(low_sumsq, _mm_srli_epi64(low_sumsq, 32)); - int sumsq2_acc = _mm_cvtsi128_si32(low_sumsq); - int cross_acc = _mm_extract_epi32(low_sumsq, 2); - - int var2 = sumsq2_acc * MATCH_SZ_SQ - sum2_acc * sum2_acc; - int cov = cross_acc * MATCH_SZ_SQ - sum1_acc * sum2_acc; - return cov / sqrt((double)var2); + + // Sum cross_vec into a single value + const __m128i tmp = _mm_add_epi32(_mm256_extracti128_si256(cross_vec, 0), + _mm256_extracti128_si256(cross_vec, 1)); + const int cross = _mm_extract_epi32(tmp, 0) + _mm_extract_epi32(tmp, 1) + + _mm_extract_epi32(tmp, 2) + _mm_extract_epi32(tmp, 3); + + // Note: In theory, the calculations here "should" be + // covariance = cross / N^2 - mean1 * mean2 + // correlation = covariance / (stddev1 * stddev2). + // + // However, because of the scaling in aom_compute_mean_stddev, the + // lines below actually calculate + // covariance * N^2 = cross - (mean1 * N) * (mean2 * N) + // correlation = (covariance * N^2) / ((stddev1 * N) * (stddev2 * N)) + // + // ie. we have removed the need for a division, and still end up with the + // correct unscaled correlation (ie, in the range [-1, +1]) + const double covariance = cross - mean1 * mean2; + const double correlation = covariance * (one_over_stddev1 * one_over_stddev2); + return correlation; } |