From d8bbc7858622b6d9c278469aab701ca0b609cddf Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Wed, 15 May 2024 05:35:49 +0200 Subject: Merging upstream version 126.0. Signed-off-by: Daniel Baumann --- .../aom/aom_dsp/flow_estimation/corner_detect.c | 44 ++- .../aom/aom_dsp/flow_estimation/corner_detect.h | 5 +- .../aom/aom_dsp/flow_estimation/corner_match.c | 317 ++++++++------- .../aom/aom_dsp/flow_estimation/corner_match.h | 12 +- third_party/aom/aom_dsp/flow_estimation/disflow.c | 36 +- third_party/aom/aom_dsp/flow_estimation/disflow.h | 11 +- .../aom/aom_dsp/flow_estimation/flow_estimation.c | 20 +- .../aom/aom_dsp/flow_estimation/flow_estimation.h | 7 +- third_party/aom/aom_dsp/flow_estimation/ransac.c | 349 +++++++++-------- .../flow_estimation/x86/corner_match_avx2.c | 148 ++++--- .../flow_estimation/x86/corner_match_sse4.c | 171 +++++---- .../aom/aom_dsp/flow_estimation/x86/disflow_avx2.c | 417 ++++++++++++++++++++ .../aom/aom_dsp/flow_estimation/x86/disflow_sse4.c | 424 +++++++-------------- 13 files changed, 1192 insertions(+), 769 deletions(-) create mode 100644 third_party/aom/aom_dsp/flow_estimation/x86/disflow_avx2.c (limited to 'third_party/aom/aom_dsp/flow_estimation') diff --git a/third_party/aom/aom_dsp/flow_estimation/corner_detect.c b/third_party/aom/aom_dsp/flow_estimation/corner_detect.c index 284d1bd7b8..44d423dcdf 100644 --- a/third_party/aom/aom_dsp/flow_estimation/corner_detect.c +++ b/third_party/aom/aom_dsp/flow_estimation/corner_detect.c @@ -20,6 +20,7 @@ #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/flow_estimation/corner_detect.h" #include "aom_mem/aom_mem.h" +#include "aom_util/aom_pthread.h" #include "av1/common/common.h" #define FAST_BARRIER 18 @@ -39,11 +40,24 @@ CornerList *av1_alloc_corner_list(void) { return corners; } -static bool compute_corner_list(const ImagePyramid *pyr, CornerList *corners) { - const uint8_t *buf = pyr->layers[0].buffer; - int width = pyr->layers[0].width; - int height = pyr->layers[0].height; - int stride = pyr->layers[0].stride; +static bool compute_corner_list(const YV12_BUFFER_CONFIG *frame, int bit_depth, + int downsample_level, CornerList *corners) { + ImagePyramid *pyr = frame->y_pyramid; + const int layers = + aom_compute_pyramid(frame, bit_depth, downsample_level + 1, pyr); + + if (layers < 0) { + return false; + } + + // Clamp downsampling ratio base on max number of layers allowed + // for this frame size + downsample_level = layers - 1; + + const uint8_t *buf = pyr->layers[downsample_level].buffer; + int width = pyr->layers[downsample_level].width; + int height = pyr->layers[downsample_level].height; + int stride = pyr->layers[downsample_level].stride; int *scores = NULL; int num_corners; @@ -53,9 +67,11 @@ static bool compute_corner_list(const ImagePyramid *pyr, CornerList *corners) { if (num_corners <= MAX_CORNERS) { // Use all detected corners - if (num_corners != 0) { - memcpy(corners->corners, frame_corners_xy, - sizeof(*frame_corners_xy) * num_corners); + for (int i = 0; i < num_corners; i++) { + corners->corners[2 * i + 0] = + frame_corners_xy[i].x * (1 << downsample_level); + corners->corners[2 * i + 1] = + frame_corners_xy[i].y * (1 << downsample_level); } corners->num_corners = num_corners; } else { @@ -85,8 +101,10 @@ static bool compute_corner_list(const ImagePyramid *pyr, CornerList *corners) { for (int i = 0; i < num_corners; i++) { if (scores[i] > threshold) { assert(copied_corners < MAX_CORNERS); - corners->corners[2 * copied_corners + 0] = frame_corners_xy[i].x; - corners->corners[2 * 
copied_corners + 1] = frame_corners_xy[i].y; + corners->corners[2 * copied_corners + 0] = + frame_corners_xy[i].x * (1 << downsample_level); + corners->corners[2 * copied_corners + 1] = + frame_corners_xy[i].y * (1 << downsample_level); copied_corners += 1; } } @@ -99,7 +117,8 @@ static bool compute_corner_list(const ImagePyramid *pyr, CornerList *corners) { return true; } -bool av1_compute_corner_list(const ImagePyramid *pyr, CornerList *corners) { +bool av1_compute_corner_list(const YV12_BUFFER_CONFIG *frame, int bit_depth, + int downsample_level, CornerList *corners) { assert(corners); #if CONFIG_MULTITHREAD @@ -107,7 +126,8 @@ bool av1_compute_corner_list(const ImagePyramid *pyr, CornerList *corners) { #endif // CONFIG_MULTITHREAD if (!corners->valid) { - corners->valid = compute_corner_list(pyr, corners); + corners->valid = + compute_corner_list(frame, bit_depth, downsample_level, corners); } bool valid = corners->valid; diff --git a/third_party/aom/aom_dsp/flow_estimation/corner_detect.h b/third_party/aom/aom_dsp/flow_estimation/corner_detect.h index d05846ce5d..54d94309ed 100644 --- a/third_party/aom/aom_dsp/flow_estimation/corner_detect.h +++ b/third_party/aom/aom_dsp/flow_estimation/corner_detect.h @@ -18,7 +18,7 @@ #include #include "aom_dsp/pyramid.h" -#include "aom_util/aom_thread.h" +#include "aom_util/aom_pthread.h" #ifdef __cplusplus extern "C" { @@ -57,7 +57,8 @@ size_t av1_get_corner_list_size(void); CornerList *av1_alloc_corner_list(void); -bool av1_compute_corner_list(const ImagePyramid *pyr, CornerList *corners); +bool av1_compute_corner_list(const YV12_BUFFER_CONFIG *frame, int bit_depth, + int downsample_level, CornerList *corners); #ifndef NDEBUG // Check if a corner list has already been computed. diff --git a/third_party/aom/aom_dsp/flow_estimation/corner_match.c b/third_party/aom/aom_dsp/flow_estimation/corner_match.c index dc7589a8c6..c78edb8910 100644 --- a/third_party/aom/aom_dsp/flow_estimation/corner_match.c +++ b/third_party/aom/aom_dsp/flow_estimation/corner_match.c @@ -17,62 +17,84 @@ #include "aom_dsp/flow_estimation/corner_detect.h" #include "aom_dsp/flow_estimation/corner_match.h" +#include "aom_dsp/flow_estimation/disflow.h" #include "aom_dsp/flow_estimation/flow_estimation.h" #include "aom_dsp/flow_estimation/ransac.h" #include "aom_dsp/pyramid.h" #include "aom_scale/yv12config.h" -#define SEARCH_SZ 9 -#define SEARCH_SZ_BY2 ((SEARCH_SZ - 1) / 2) - #define THRESHOLD_NCC 0.75 -/* Compute var(frame) * MATCH_SZ_SQ over a MATCH_SZ by MATCH_SZ window of frame, - centered at (x, y). +/* Compute mean and standard deviation of pixels in a window of size + MATCH_SZ by MATCH_SZ centered at (x, y). + Store results into *mean and *one_over_stddev + + Note: The output of this function is scaled by MATCH_SZ, as in + *mean = MATCH_SZ * and + *one_over_stddev = 1 / (MATCH_SZ * ) + + Combined with the fact that we return 1/stddev rather than the standard + deviation itself, this allows us to completely avoid divisions in + aom_compute_correlation, which is much hotter than this function is. + + Returns true if this feature point is usable, false otherwise. 
*/ -static double compute_variance(const unsigned char *frame, int stride, int x, - int y) { +bool aom_compute_mean_stddev_c(const unsigned char *frame, int stride, int x, + int y, double *mean, double *one_over_stddev) { int sum = 0; int sumsq = 0; - int var; - int i, j; - for (i = 0; i < MATCH_SZ; ++i) - for (j = 0; j < MATCH_SZ; ++j) { + for (int i = 0; i < MATCH_SZ; ++i) { + for (int j = 0; j < MATCH_SZ; ++j) { sum += frame[(i + y - MATCH_SZ_BY2) * stride + (j + x - MATCH_SZ_BY2)]; sumsq += frame[(i + y - MATCH_SZ_BY2) * stride + (j + x - MATCH_SZ_BY2)] * frame[(i + y - MATCH_SZ_BY2) * stride + (j + x - MATCH_SZ_BY2)]; } - var = sumsq * MATCH_SZ_SQ - sum * sum; - return (double)var; + } + *mean = (double)sum / MATCH_SZ; + const double variance = sumsq - (*mean) * (*mean); + if (variance < MIN_FEATURE_VARIANCE) { + *one_over_stddev = 0.0; + return false; + } + *one_over_stddev = 1.0 / sqrt(variance); + return true; } -/* Compute corr(frame1, frame2) * MATCH_SZ * stddev(frame1), where the - correlation/standard deviation are taken over MATCH_SZ by MATCH_SZ windows - of each image, centered at (x1, y1) and (x2, y2) respectively. +/* Compute corr(frame1, frame2) over a window of size MATCH_SZ by MATCH_SZ. + To save on computation, the mean and (1 divided by the) standard deviation + of the window in each frame are precomputed and passed into this function + as arguments. */ -double av1_compute_cross_correlation_c(const unsigned char *frame1, int stride1, - int x1, int y1, - const unsigned char *frame2, int stride2, - int x2, int y2) { +double aom_compute_correlation_c(const unsigned char *frame1, int stride1, + int x1, int y1, double mean1, + double one_over_stddev1, + const unsigned char *frame2, int stride2, + int x2, int y2, double mean2, + double one_over_stddev2) { int v1, v2; - int sum1 = 0; - int sum2 = 0; - int sumsq2 = 0; int cross = 0; - int var2, cov; - int i, j; - for (i = 0; i < MATCH_SZ; ++i) - for (j = 0; j < MATCH_SZ; ++j) { + for (int i = 0; i < MATCH_SZ; ++i) { + for (int j = 0; j < MATCH_SZ; ++j) { v1 = frame1[(i + y1 - MATCH_SZ_BY2) * stride1 + (j + x1 - MATCH_SZ_BY2)]; v2 = frame2[(i + y2 - MATCH_SZ_BY2) * stride2 + (j + x2 - MATCH_SZ_BY2)]; - sum1 += v1; - sum2 += v2; - sumsq2 += v2 * v2; cross += v1 * v2; } - var2 = sumsq2 * MATCH_SZ_SQ - sum2 * sum2; - cov = cross * MATCH_SZ_SQ - sum1 * sum2; - return cov / sqrt((double)var2); + } + + // Note: In theory, the calculations here "should" be + // covariance = cross / N^2 - mean1 * mean2 + // correlation = covariance / (stddev1 * stddev2). + // + // However, because of the scaling in aom_compute_mean_stddev, the + // lines below actually calculate + // covariance * N^2 = cross - (mean1 * N) * (mean2 * N) + // correlation = (covariance * N^2) / ((stddev1 * N) * (stddev2 * N)) + // + // ie. 
we have removed the need for a division, and still end up with the + // correct unscaled correlation (ie, in the range [-1, +1]) + double covariance = cross - mean1 * mean2; + double correlation = covariance * (one_over_stddev1 * one_over_stddev2); + return correlation; } static int is_eligible_point(int pointx, int pointy, int width, int height) { @@ -87,65 +109,14 @@ static int is_eligible_distance(int point1x, int point1y, int point2x, (point1y - point2y) * (point1y - point2y)) <= thresh * thresh; } -static void improve_correspondence(const unsigned char *src, - const unsigned char *ref, int width, - int height, int src_stride, int ref_stride, - Correspondence *correspondences, - int num_correspondences) { - int i; - for (i = 0; i < num_correspondences; ++i) { - int x, y, best_x = 0, best_y = 0; - double best_match_ncc = 0.0; - // For this algorithm, all points have integer coordinates. - // It's a little more efficient to convert them to ints once, - // before the inner loops - int x0 = (int)correspondences[i].x; - int y0 = (int)correspondences[i].y; - int rx0 = (int)correspondences[i].rx; - int ry0 = (int)correspondences[i].ry; - for (y = -SEARCH_SZ_BY2; y <= SEARCH_SZ_BY2; ++y) { - for (x = -SEARCH_SZ_BY2; x <= SEARCH_SZ_BY2; ++x) { - double match_ncc; - if (!is_eligible_point(rx0 + x, ry0 + y, width, height)) continue; - if (!is_eligible_distance(x0, y0, rx0 + x, ry0 + y, width, height)) - continue; - match_ncc = av1_compute_cross_correlation(src, src_stride, x0, y0, ref, - ref_stride, rx0 + x, ry0 + y); - if (match_ncc > best_match_ncc) { - best_match_ncc = match_ncc; - best_y = y; - best_x = x; - } - } - } - correspondences[i].rx += best_x; - correspondences[i].ry += best_y; - } - for (i = 0; i < num_correspondences; ++i) { - int x, y, best_x = 0, best_y = 0; - double best_match_ncc = 0.0; - int x0 = (int)correspondences[i].x; - int y0 = (int)correspondences[i].y; - int rx0 = (int)correspondences[i].rx; - int ry0 = (int)correspondences[i].ry; - for (y = -SEARCH_SZ_BY2; y <= SEARCH_SZ_BY2; ++y) - for (x = -SEARCH_SZ_BY2; x <= SEARCH_SZ_BY2; ++x) { - double match_ncc; - if (!is_eligible_point(x0 + x, y0 + y, width, height)) continue; - if (!is_eligible_distance(x0 + x, y0 + y, rx0, ry0, width, height)) - continue; - match_ncc = av1_compute_cross_correlation( - ref, ref_stride, rx0, ry0, src, src_stride, x0 + x, y0 + y); - if (match_ncc > best_match_ncc) { - best_match_ncc = match_ncc; - best_y = y; - best_x = x; - } - } - correspondences[i].x += best_x; - correspondences[i].y += best_y; - } -} +typedef struct { + int x; + int y; + double mean; + double one_over_stddev; + int best_match_idx; + double best_match_corr; +} PointInfo; static int determine_correspondence(const unsigned char *src, const int *src_corners, int num_src_corners, @@ -154,56 +125,136 @@ static int determine_correspondence(const unsigned char *src, int width, int height, int src_stride, int ref_stride, Correspondence *correspondences) { - // TODO(sarahparker) Improve this to include 2-way match - int i, j; + PointInfo *src_point_info = NULL; + PointInfo *ref_point_info = NULL; int num_correspondences = 0; - for (i = 0; i < num_src_corners; ++i) { - double best_match_ncc = 0.0; - double template_norm; - int best_match_j = -1; - if (!is_eligible_point(src_corners[2 * i], src_corners[2 * i + 1], width, - height)) + + src_point_info = + (PointInfo *)aom_calloc(num_src_corners, sizeof(*src_point_info)); + if (!src_point_info) { + goto finished; + } + + ref_point_info = + (PointInfo *)aom_calloc(num_ref_corners, 
sizeof(*ref_point_info)); + if (!ref_point_info) { + goto finished; + } + + // First pass (linear): + // Filter corner lists and compute per-patch means and standard deviations, + // for the src and ref frames independently + int src_point_count = 0; + for (int i = 0; i < num_src_corners; i++) { + int src_x = src_corners[2 * i]; + int src_y = src_corners[2 * i + 1]; + if (!is_eligible_point(src_x, src_y, width, height)) continue; + + PointInfo *point = &src_point_info[src_point_count]; + point->x = src_x; + point->y = src_y; + point->best_match_corr = THRESHOLD_NCC; + if (!aom_compute_mean_stddev(src, src_stride, src_x, src_y, &point->mean, + &point->one_over_stddev)) continue; - for (j = 0; j < num_ref_corners; ++j) { - double match_ncc; - if (!is_eligible_point(ref_corners[2 * j], ref_corners[2 * j + 1], width, - height)) - continue; - if (!is_eligible_distance(src_corners[2 * i], src_corners[2 * i + 1], - ref_corners[2 * j], ref_corners[2 * j + 1], - width, height)) + src_point_count++; + } + if (src_point_count == 0) { + goto finished; + } + + int ref_point_count = 0; + for (int j = 0; j < num_ref_corners; j++) { + int ref_x = ref_corners[2 * j]; + int ref_y = ref_corners[2 * j + 1]; + if (!is_eligible_point(ref_x, ref_y, width, height)) continue; + + PointInfo *point = &ref_point_info[ref_point_count]; + point->x = ref_x; + point->y = ref_y; + point->best_match_corr = THRESHOLD_NCC; + if (!aom_compute_mean_stddev(ref, ref_stride, ref_x, ref_y, &point->mean, + &point->one_over_stddev)) + continue; + ref_point_count++; + } + if (ref_point_count == 0) { + goto finished; + } + + // Second pass (quadratic): + // For each pair of points, compute correlation, and use this to determine + // the best match of each corner, in both directions + for (int i = 0; i < src_point_count; ++i) { + PointInfo *src_point = &src_point_info[i]; + for (int j = 0; j < ref_point_count; ++j) { + PointInfo *ref_point = &ref_point_info[j]; + if (!is_eligible_distance(src_point->x, src_point->y, ref_point->x, + ref_point->y, width, height)) continue; - match_ncc = av1_compute_cross_correlation( - src, src_stride, src_corners[2 * i], src_corners[2 * i + 1], ref, - ref_stride, ref_corners[2 * j], ref_corners[2 * j + 1]); - if (match_ncc > best_match_ncc) { - best_match_ncc = match_ncc; - best_match_j = j; + + double corr = aom_compute_correlation( + src, src_stride, src_point->x, src_point->y, src_point->mean, + src_point->one_over_stddev, ref, ref_stride, ref_point->x, + ref_point->y, ref_point->mean, ref_point->one_over_stddev); + + if (corr > src_point->best_match_corr) { + src_point->best_match_idx = j; + src_point->best_match_corr = corr; + } + if (corr > ref_point->best_match_corr) { + ref_point->best_match_idx = i; + ref_point->best_match_corr = corr; } } - // Note: We want to test if the best correlation is >= THRESHOLD_NCC, - // but need to account for the normalization in - // av1_compute_cross_correlation. 
- template_norm = compute_variance(src, src_stride, src_corners[2 * i], - src_corners[2 * i + 1]); - if (best_match_ncc > THRESHOLD_NCC * sqrt(template_norm)) { - correspondences[num_correspondences].x = src_corners[2 * i]; - correspondences[num_correspondences].y = src_corners[2 * i + 1]; - correspondences[num_correspondences].rx = ref_corners[2 * best_match_j]; - correspondences[num_correspondences].ry = - ref_corners[2 * best_match_j + 1]; + } + + // Third pass (linear): + // Scan through source corners, generating a correspondence for each corner + // iff ref_best_match[src_best_match[i]] == i + // Then refine the generated correspondences using optical flow + for (int i = 0; i < src_point_count; i++) { + PointInfo *point = &src_point_info[i]; + + // Skip corners which were not matched, or which didn't find + // a good enough match + if (point->best_match_corr < THRESHOLD_NCC) continue; + + PointInfo *match_point = &ref_point_info[point->best_match_idx]; + if (match_point->best_match_idx == i) { + // Refine match using optical flow and store + const int sx = point->x; + const int sy = point->y; + const int rx = match_point->x; + const int ry = match_point->y; + double u = (double)(rx - sx); + double v = (double)(ry - sy); + + const int patch_tl_x = sx - DISFLOW_PATCH_CENTER; + const int patch_tl_y = sy - DISFLOW_PATCH_CENTER; + + aom_compute_flow_at_point(src, ref, patch_tl_x, patch_tl_y, width, height, + src_stride, &u, &v); + + Correspondence *correspondence = &correspondences[num_correspondences]; + correspondence->x = (double)sx; + correspondence->y = (double)sy; + correspondence->rx = (double)sx + u; + correspondence->ry = (double)sy + v; num_correspondences++; } } - improve_correspondence(src, ref, width, height, src_stride, ref_stride, - correspondences, num_correspondences); + +finished: + aom_free(src_point_info); + aom_free(ref_point_info); return num_correspondences; } bool av1_compute_global_motion_feature_match( TransformationType type, YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *ref, - int bit_depth, MotionModel *motion_models, int num_motion_models, - bool *mem_alloc_failed) { + int bit_depth, int downsample_level, MotionModel *motion_models, + int num_motion_models, bool *mem_alloc_failed) { int num_correspondences; Correspondence *correspondences; ImagePyramid *src_pyramid = src->y_pyramid; @@ -212,19 +263,19 @@ bool av1_compute_global_motion_feature_match( CornerList *ref_corners = ref->corners; // Precompute information we will need about each frame - if (!aom_compute_pyramid(src, bit_depth, src_pyramid)) { + if (aom_compute_pyramid(src, bit_depth, 1, src_pyramid) < 0) { *mem_alloc_failed = true; return false; } - if (!av1_compute_corner_list(src_pyramid, src_corners)) { + if (!av1_compute_corner_list(src, bit_depth, downsample_level, src_corners)) { *mem_alloc_failed = true; return false; } - if (!aom_compute_pyramid(ref, bit_depth, ref_pyramid)) { + if (aom_compute_pyramid(ref, bit_depth, 1, ref_pyramid) < 0) { *mem_alloc_failed = true; return false; } - if (!av1_compute_corner_list(ref_pyramid, ref_corners)) { + if (!av1_compute_corner_list(src, bit_depth, downsample_level, ref_corners)) { *mem_alloc_failed = true; return false; } diff --git a/third_party/aom/aom_dsp/flow_estimation/corner_match.h b/third_party/aom/aom_dsp/flow_estimation/corner_match.h index 4435d2c767..77ebee2ea3 100644 --- a/third_party/aom/aom_dsp/flow_estimation/corner_match.h +++ b/third_party/aom/aom_dsp/flow_estimation/corner_match.h @@ -25,14 +25,20 @@ extern "C" { #endif -#define 
MATCH_SZ 13 +#define MATCH_SZ 16 #define MATCH_SZ_BY2 ((MATCH_SZ - 1) / 2) #define MATCH_SZ_SQ (MATCH_SZ * MATCH_SZ) +// Minimum threshold for the variance of a patch, in order for it to be +// considered useful for matching. +// This is evaluated against the scaled variance MATCH_SZ_SQ * sigma^2, +// so a setting of 1 * MATCH_SZ_SQ corresponds to an unscaled variance of 1 +#define MIN_FEATURE_VARIANCE (1 * MATCH_SZ_SQ) + bool av1_compute_global_motion_feature_match( TransformationType type, YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *ref, - int bit_depth, MotionModel *motion_models, int num_motion_models, - bool *mem_alloc_failed); + int bit_depth, int downsample_level, MotionModel *motion_models, + int num_motion_models, bool *mem_alloc_failed); #ifdef __cplusplus } diff --git a/third_party/aom/aom_dsp/flow_estimation/disflow.c b/third_party/aom/aom_dsp/flow_estimation/disflow.c index 82b531c729..f511a6eb49 100644 --- a/third_party/aom/aom_dsp/flow_estimation/disflow.c +++ b/third_party/aom/aom_dsp/flow_estimation/disflow.c @@ -603,9 +603,9 @@ static void upscale_flow_component(double *flow, int cur_width, int cur_height, // make sure flow_u and flow_v start at 0 static bool compute_flow_field(const ImagePyramid *src_pyr, - const ImagePyramid *ref_pyr, FlowField *flow) { + const ImagePyramid *ref_pyr, int n_levels, + FlowField *flow) { bool mem_status = true; - assert(src_pyr->n_levels == ref_pyr->n_levels); double *flow_u = flow->u; double *flow_v = flow->v; @@ -613,7 +613,7 @@ static bool compute_flow_field(const ImagePyramid *src_pyr, double *tmpbuf0; double *tmpbuf; - if (src_pyr->n_levels < 2) { + if (n_levels < 2) { // tmpbuf not needed tmpbuf0 = NULL; tmpbuf = NULL; @@ -639,7 +639,7 @@ static bool compute_flow_field(const ImagePyramid *src_pyr, // correspondences by interpolating this flow field, and then refine the // correspondences themselves. This is both faster and gives better output // compared to refining the flow field at level 0 and then interpolating. - for (int level = src_pyr->n_levels - 1; level >= 1; --level) { + for (int level = n_levels - 1; level >= 1; --level) { const PyramidLayer *cur_layer = &src_pyr->layers[level]; const int cur_width = cur_layer->width; const int cur_height = cur_layer->height; @@ -762,29 +762,31 @@ static void free_flow_field(FlowField *flow) { // Following the convention in flow_estimation.h, the flow vectors are computed // at fixed points in `src` and point to the corresponding locations in `ref`, // regardless of the temporal ordering of the frames. 
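A minimal sketch of the flow-vector convention described in the comment above, mirroring how the corner_match.c changes earlier in this diff build their Correspondence entries; the helper name and the variable c are hypothetical and not part of the library:

// A flow vector (u, v) estimated at a fixed source position (x, y) points to
// the matching location in the reference frame, so the resulting
// correspondence is (x, y) -> (x + u, y + v), regardless of which frame is
// temporally first.
static void flow_to_correspondence(double x, double y, double u, double v,
                                   Correspondence *c) {
  c->x = x;
  c->y = y;
  c->rx = x + u;
  c->ry = y + v;
}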
-bool av1_compute_global_motion_disflow(TransformationType type, - YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *ref, int bit_depth, - MotionModel *motion_models, - int num_motion_models, - bool *mem_alloc_failed) { +bool av1_compute_global_motion_disflow( + TransformationType type, YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *ref, + int bit_depth, int downsample_level, MotionModel *motion_models, + int num_motion_models, bool *mem_alloc_failed) { // Precompute information we will need about each frame ImagePyramid *src_pyramid = src->y_pyramid; CornerList *src_corners = src->corners; ImagePyramid *ref_pyramid = ref->y_pyramid; - if (!aom_compute_pyramid(src, bit_depth, src_pyramid)) { - *mem_alloc_failed = true; - return false; - } - if (!av1_compute_corner_list(src_pyramid, src_corners)) { + + const int src_layers = + aom_compute_pyramid(src, bit_depth, DISFLOW_PYRAMID_LEVELS, src_pyramid); + const int ref_layers = + aom_compute_pyramid(ref, bit_depth, DISFLOW_PYRAMID_LEVELS, ref_pyramid); + + if (src_layers < 0 || ref_layers < 0) { *mem_alloc_failed = true; return false; } - if (!aom_compute_pyramid(ref, bit_depth, ref_pyramid)) { + if (!av1_compute_corner_list(src, bit_depth, downsample_level, src_corners)) { *mem_alloc_failed = true; return false; } + assert(src_layers == ref_layers); + const int src_width = src_pyramid->layers[0].width; const int src_height = src_pyramid->layers[0].height; assert(ref_pyramid->layers[0].width == src_width); @@ -796,7 +798,7 @@ bool av1_compute_global_motion_disflow(TransformationType type, return false; } - if (!compute_flow_field(src_pyramid, ref_pyramid, flow)) { + if (!compute_flow_field(src_pyramid, ref_pyramid, src_layers, flow)) { *mem_alloc_failed = true; free_flow_field(flow); return false; diff --git a/third_party/aom/aom_dsp/flow_estimation/disflow.h b/third_party/aom/aom_dsp/flow_estimation/disflow.h index ef877b638c..ac3680004d 100644 --- a/third_party/aom/aom_dsp/flow_estimation/disflow.h +++ b/third_party/aom/aom_dsp/flow_estimation/disflow.h @@ -15,7 +15,6 @@ #include #include "aom_dsp/flow_estimation/flow_estimation.h" -#include "aom_dsp/rect.h" #include "aom_scale/yv12config.h" #ifdef __cplusplus @@ -92,12 +91,10 @@ typedef struct { int stride; } FlowField; -bool av1_compute_global_motion_disflow(TransformationType type, - YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *ref, int bit_depth, - MotionModel *motion_models, - int num_motion_models, - bool *mem_alloc_failed); +bool av1_compute_global_motion_disflow( + TransformationType type, YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *ref, + int bit_depth, int downsample_level, MotionModel *motion_models, + int num_motion_models, bool *mem_alloc_failed); #ifdef __cplusplus } diff --git a/third_party/aom/aom_dsp/flow_estimation/flow_estimation.c b/third_party/aom/aom_dsp/flow_estimation/flow_estimation.c index 0f47f86f55..96624eb863 100644 --- a/third_party/aom/aom_dsp/flow_estimation/flow_estimation.c +++ b/third_party/aom/aom_dsp/flow_estimation/flow_estimation.c @@ -18,14 +18,6 @@ #include "aom_ports/mem.h" #include "aom_scale/yv12config.h" -// For each global motion method, how many pyramid levels should we allocate? 
-// Note that this is a maximum, and fewer levels will be allocated if the frame -// is not large enough to need all of the specified levels -const int global_motion_pyr_levels[GLOBAL_MOTION_METHODS] = { - 1, // GLOBAL_MOTION_METHOD_FEATURE_MATCH - 16, // GLOBAL_MOTION_METHOD_DISFLOW -}; - // clang-format off const double kIdentityParams[MAX_PARAMDIM] = { 0.0, 0.0, 1.0, 0.0, 0.0, 1.0 @@ -43,17 +35,17 @@ const double kIdentityParams[MAX_PARAMDIM] = { bool aom_compute_global_motion(TransformationType type, YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *ref, int bit_depth, GlobalMotionMethod gm_method, - MotionModel *motion_models, + int downsample_level, MotionModel *motion_models, int num_motion_models, bool *mem_alloc_failed) { switch (gm_method) { case GLOBAL_MOTION_METHOD_FEATURE_MATCH: return av1_compute_global_motion_feature_match( - type, src, ref, bit_depth, motion_models, num_motion_models, - mem_alloc_failed); + type, src, ref, bit_depth, downsample_level, motion_models, + num_motion_models, mem_alloc_failed); case GLOBAL_MOTION_METHOD_DISFLOW: - return av1_compute_global_motion_disflow(type, src, ref, bit_depth, - motion_models, num_motion_models, - mem_alloc_failed); + return av1_compute_global_motion_disflow( + type, src, ref, bit_depth, downsample_level, motion_models, + num_motion_models, mem_alloc_failed); default: assert(0 && "Unknown global motion estimation type"); } return false; diff --git a/third_party/aom/aom_dsp/flow_estimation/flow_estimation.h b/third_party/aom/aom_dsp/flow_estimation/flow_estimation.h index 2dfae24980..a38b03fc4e 100644 --- a/third_party/aom/aom_dsp/flow_estimation/flow_estimation.h +++ b/third_party/aom/aom_dsp/flow_estimation/flow_estimation.h @@ -61,11 +61,6 @@ typedef struct { double rx, ry; } Correspondence; -// For each global motion method, how many pyramid levels should we allocate? -// Note that this is a maximum, and fewer levels will be allocated if the frame -// is not large enough to need all of the specified levels -extern const int global_motion_pyr_levels[GLOBAL_MOTION_METHODS]; - // Which global motion method should we use in practice? // Disflow is both faster and gives better results than feature matching in // practically all cases, so we use disflow by default @@ -85,7 +80,7 @@ extern const double kIdentityParams[MAX_PARAMDIM]; bool aom_compute_global_motion(TransformationType type, YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *ref, int bit_depth, GlobalMotionMethod gm_method, - MotionModel *motion_models, + int downsample_level, MotionModel *motion_models, int num_motion_models, bool *mem_alloc_failed); #ifdef __cplusplus diff --git a/third_party/aom/aom_dsp/flow_estimation/ransac.c b/third_party/aom/aom_dsp/flow_estimation/ransac.c index b88a07b023..7c7bebdda4 100644 --- a/third_party/aom/aom_dsp/flow_estimation/ransac.c +++ b/third_party/aom/aom_dsp/flow_estimation/ransac.c @@ -29,8 +29,13 @@ #define INLIER_THRESHOLD 1.25 #define INLIER_THRESHOLD_SQUARED (INLIER_THRESHOLD * INLIER_THRESHOLD) + +// Number of initial models to generate #define NUM_TRIALS 20 +// Number of times to refine the best model found +#define NUM_REFINES 5 + // Flag to enable functions for finding TRANSLATION type models. // // These modes are not considered currently due to a spec bug (see comments @@ -39,63 +44,110 @@ // but disabled, for completeness. 
#define ALLOW_TRANSLATION_MODELS 0 +typedef struct { + int num_inliers; + double sse; // Sum of squared errors of inliers + int *inlier_indices; +} RANSAC_MOTION; + //////////////////////////////////////////////////////////////////////////////// // ransac -typedef bool (*IsDegenerateFunc)(double *p); -typedef bool (*FindTransformationFunc)(int points, const double *points1, - const double *points2, double *params); -typedef void (*ProjectPointsFunc)(const double *mat, const double *points, - double *proj, int n, int stride_points, - int stride_proj); +typedef bool (*FindTransformationFunc)(const Correspondence *points, + const int *indices, int num_indices, + double *params); +typedef void (*ScoreModelFunc)(const double *mat, const Correspondence *points, + int num_points, RANSAC_MOTION *model); // vtable-like structure which stores all of the information needed by RANSAC // for a particular model type typedef struct { - IsDegenerateFunc is_degenerate; FindTransformationFunc find_transformation; - ProjectPointsFunc project_points; + ScoreModelFunc score_model; + + // The minimum number of points which can be passed to find_transformation + // to generate a model. + // + // This should be set as small as possible. This is due to an observation + // from section 4 of "Optimal Ransac" by A. Hast, J. Nysjö and + // A. Marchetti (https://dspace5.zcu.cz/bitstream/11025/6869/1/Hast.pdf): + // using the minimum possible number of points in the initial model maximizes + // the chance that all of the selected points are inliers. + // + // That paper proposes a method which can deal with models which are + // contaminated by outliers, which helps in cases where the inlier fraction + // is low. However, for our purposes, global motion only gives significant + // gains when the inlier fraction is high. + // + // So we do not use the method from this paper, but we do find that + // minimizing the number of points used for initial model fitting helps + // make the best use of the limited number of models we consider. 
int minpts; } RansacModelInfo; #if ALLOW_TRANSLATION_MODELS -static void project_points_translation(const double *mat, const double *points, - double *proj, int n, int stride_points, - int stride_proj) { - int i; - for (i = 0; i < n; ++i) { - const double x = *(points++), y = *(points++); - *(proj++) = x + mat[0]; - *(proj++) = y + mat[1]; - points += stride_points - 2; - proj += stride_proj - 2; +static void score_translation(const double *mat, const Correspondence *points, + int num_points, RANSAC_MOTION *model) { + model->num_inliers = 0; + model->sse = 0.0; + + for (int i = 0; i < num_points; ++i) { + const double x1 = points[i].x; + const double y1 = points[i].y; + const double x2 = points[i].rx; + const double y2 = points[i].ry; + + const double proj_x = x1 + mat[0]; + const double proj_y = y1 + mat[1]; + + const double dx = proj_x - x2; + const double dy = proj_y - y2; + const double sse = dx * dx + dy * dy; + + if (sse < INLIER_THRESHOLD_SQUARED) { + model->inlier_indices[model->num_inliers++] = i; + model->sse += sse; + } } } #endif // ALLOW_TRANSLATION_MODELS -static void project_points_affine(const double *mat, const double *points, - double *proj, int n, int stride_points, - int stride_proj) { - int i; - for (i = 0; i < n; ++i) { - const double x = *(points++), y = *(points++); - *(proj++) = mat[2] * x + mat[3] * y + mat[0]; - *(proj++) = mat[4] * x + mat[5] * y + mat[1]; - points += stride_points - 2; - proj += stride_proj - 2; +static void score_affine(const double *mat, const Correspondence *points, + int num_points, RANSAC_MOTION *model) { + model->num_inliers = 0; + model->sse = 0.0; + + for (int i = 0; i < num_points; ++i) { + const double x1 = points[i].x; + const double y1 = points[i].y; + const double x2 = points[i].rx; + const double y2 = points[i].ry; + + const double proj_x = mat[2] * x1 + mat[3] * y1 + mat[0]; + const double proj_y = mat[4] * x1 + mat[5] * y1 + mat[1]; + + const double dx = proj_x - x2; + const double dy = proj_y - y2; + const double sse = dx * dx + dy * dy; + + if (sse < INLIER_THRESHOLD_SQUARED) { + model->inlier_indices[model->num_inliers++] = i; + model->sse += sse; + } } } #if ALLOW_TRANSLATION_MODELS -static bool find_translation(int np, const double *pts1, const double *pts2, - double *params) { +static bool find_translation(const Correspondence *points, const int *indices, + int num_indices, double *params) { double sumx = 0; double sumy = 0; - for (int i = 0; i < np; ++i) { - double dx = *(pts2++); - double dy = *(pts2++); - double sx = *(pts1++); - double sy = *(pts1++); + for (int i = 0; i < num_indices; ++i) { + int index = indices[i]; + const double sx = points[index].x; + const double sy = points[index].y; + const double dx = points[index].rx; + const double dy = points[index].ry; sumx += dx - sx; sumy += dy - sy; @@ -111,8 +163,8 @@ static bool find_translation(int np, const double *pts1, const double *pts2, } #endif // ALLOW_TRANSLATION_MODELS -static bool find_rotzoom(int np, const double *pts1, const double *pts2, - double *params) { +static bool find_rotzoom(const Correspondence *points, const int *indices, + int num_indices, double *params) { const int n = 4; // Size of least-squares problem double mat[4 * 4]; // Accumulator for A'A double y[4]; // Accumulator for A'b @@ -120,11 +172,12 @@ static bool find_rotzoom(int np, const double *pts1, const double *pts2, double b; // Single element of b least_squares_init(mat, y, n); - for (int i = 0; i < np; ++i) { - double dx = *(pts2++); - double dy = *(pts2++); - double sx = 
*(pts1++); - double sy = *(pts1++); + for (int i = 0; i < num_indices; ++i) { + int index = indices[i]; + const double sx = points[index].x; + const double sy = points[index].y; + const double dx = points[index].rx; + const double dy = points[index].ry; a[0] = 1; a[1] = 0; @@ -153,8 +206,8 @@ static bool find_rotzoom(int np, const double *pts1, const double *pts2, return true; } -static bool find_affine(int np, const double *pts1, const double *pts2, - double *params) { +static bool find_affine(const Correspondence *points, const int *indices, + int num_indices, double *params) { // Note: The least squares problem for affine models is 6-dimensional, // but it splits into two independent 3-dimensional subproblems. // Solving these two subproblems separately and recombining at the end @@ -174,11 +227,12 @@ static bool find_affine(int np, const double *pts1, const double *pts2, least_squares_init(mat[0], y[0], n); least_squares_init(mat[1], y[1], n); - for (int i = 0; i < np; ++i) { - double dx = *(pts2++); - double dy = *(pts2++); - double sx = *(pts1++); - double sy = *(pts1++); + for (int i = 0; i < num_indices; ++i) { + int index = indices[i]; + const double sx = points[index].x; + const double sy = points[index].y; + const double dx = points[index].rx; + const double dy = points[index].ry; a[0][0] = 1; a[0][1] = sx; @@ -211,12 +265,6 @@ static bool find_affine(int np, const double *pts1, const double *pts2, return true; } -typedef struct { - int num_inliers; - double sse; // Sum of squared errors of inliers - int *inlier_indices; -} RANSAC_MOTION; - // Return -1 if 'a' is a better motion, 1 if 'b' is better, 0 otherwise. static int compare_motions(const void *arg_a, const void *arg_b) { const RANSAC_MOTION *motion_a = (RANSAC_MOTION *)arg_a; @@ -234,15 +282,6 @@ static bool is_better_motion(const RANSAC_MOTION *motion_a, return compare_motions(motion_a, motion_b) < 0; } -static void copy_points_at_indices(double *dest, const double *src, - const int *indices, int num_points) { - for (int i = 0; i < num_points; ++i) { - const int index = indices[i]; - dest[i * 2] = src[index * 2]; - dest[i * 2 + 1] = src[index * 2 + 1]; - } -} - // Returns true on success, false on error static bool ransac_internal(const Correspondence *matched_points, int npoints, MotionModel *motion_models, int num_desired_motions, @@ -257,10 +296,6 @@ static bool ransac_internal(const Correspondence *matched_points, int npoints, int indices[MAX_MINPTS] = { 0 }; - double *points1, *points2; - double *corners1, *corners2; - double *projected_corners; - // Store information for the num_desired_motions best transformations found // and the worst motion among them, as well as the motion currently under // consideration. @@ -271,18 +306,19 @@ static bool ransac_internal(const Correspondence *matched_points, int npoints, // currently under consideration. 
double params_this_motion[MAX_PARAMDIM]; + // Initialize output models, as a fallback in case we can't find a model + for (i = 0; i < num_desired_motions; i++) { + memcpy(motion_models[i].params, kIdentityParams, + MAX_PARAMDIM * sizeof(*(motion_models[i].params))); + motion_models[i].num_inliers = 0; + } + if (npoints < minpts * MINPTS_MULTIPLIER || npoints == 0) { return false; } int min_inliers = AOMMAX((int)(MIN_INLIER_PROB * npoints), minpts); - points1 = (double *)aom_malloc(sizeof(*points1) * npoints * 2); - points2 = (double *)aom_malloc(sizeof(*points2) * npoints * 2); - corners1 = (double *)aom_malloc(sizeof(*corners1) * npoints * 2); - corners2 = (double *)aom_malloc(sizeof(*corners2) * npoints * 2); - projected_corners = - (double *)aom_malloc(sizeof(*projected_corners) * npoints * 2); motions = (RANSAC_MOTION *)aom_calloc(num_desired_motions, sizeof(RANSAC_MOTION)); @@ -295,8 +331,7 @@ static bool ransac_internal(const Correspondence *matched_points, int npoints, int *inlier_buffer = (int *)aom_malloc(sizeof(*inlier_buffer) * npoints * (num_desired_motions + 1)); - if (!(points1 && points2 && corners1 && corners2 && projected_corners && - motions && inlier_buffer)) { + if (!(motions && inlier_buffer)) { ret_val = false; *mem_alloc_failed = true; goto finish_ransac; @@ -311,50 +346,22 @@ static bool ransac_internal(const Correspondence *matched_points, int npoints, memset(¤t_motion, 0, sizeof(current_motion)); current_motion.inlier_indices = inlier_buffer + num_desired_motions * npoints; - for (i = 0; i < npoints; ++i) { - corners1[2 * i + 0] = matched_points[i].x; - corners1[2 * i + 1] = matched_points[i].y; - corners2[2 * i + 0] = matched_points[i].rx; - corners2[2 * i + 1] = matched_points[i].ry; - } - for (int trial_count = 0; trial_count < NUM_TRIALS; trial_count++) { lcg_pick(npoints, minpts, indices, &seed); - copy_points_at_indices(points1, corners1, indices, minpts); - copy_points_at_indices(points2, corners2, indices, minpts); - - if (model_info->is_degenerate(points1)) { - continue; - } - - if (!model_info->find_transformation(minpts, points1, points2, + if (!model_info->find_transformation(matched_points, indices, minpts, params_this_motion)) { continue; } - model_info->project_points(params_this_motion, corners1, projected_corners, - npoints, 2, 2); - - current_motion.num_inliers = 0; - double sse = 0.0; - for (i = 0; i < npoints; ++i) { - double dx = projected_corners[i * 2] - corners2[i * 2]; - double dy = projected_corners[i * 2 + 1] - corners2[i * 2 + 1]; - double squared_error = dx * dx + dy * dy; - - if (squared_error < INLIER_THRESHOLD_SQUARED) { - current_motion.inlier_indices[current_motion.num_inliers++] = i; - sse += squared_error; - } - } + model_info->score_model(params_this_motion, matched_points, npoints, + ¤t_motion); if (current_motion.num_inliers < min_inliers) { // Reject models with too few inliers continue; } - current_motion.sse = sse; if (is_better_motion(¤t_motion, worst_kept_motion)) { // This motion is better than the worst currently kept motion. Remember // the inlier points and sse. The parameters for each kept motion @@ -386,86 +393,98 @@ static bool ransac_internal(const Correspondence *matched_points, int npoints, // Sort the motions, best first. qsort(motions, num_desired_motions, sizeof(RANSAC_MOTION), compare_motions); - // Recompute the motions using only the inliers. + // Refine each of the best N models using iterative estimation. 
+ // + // The idea here is loosely based on the iterative method from + // "Locally Optimized RANSAC" by O. Chum, J. Matas and Josef Kittler: + // https://cmp.felk.cvut.cz/ftp/articles/matas/chum-dagm03.pdf + // + // However, we implement a simpler version than their proposal, and simply + // refit the model repeatedly until the number of inliers stops increasing, + // with a cap on the number of iterations to defend against edge cases which + // only improve very slowly. for (i = 0; i < num_desired_motions; ++i) { - int num_inliers = motions[i].num_inliers; - if (num_inliers > 0) { - assert(num_inliers >= minpts); - - copy_points_at_indices(points1, corners1, motions[i].inlier_indices, - num_inliers); - copy_points_at_indices(points2, corners2, motions[i].inlier_indices, - num_inliers); - - if (!model_info->find_transformation(num_inliers, points1, points2, - motion_models[i].params)) { - // In the unlikely event that this model fitting fails, - // we don't have a good fallback. So just clear the output - // model and move on - memcpy(motion_models[i].params, kIdentityParams, - MAX_PARAMDIM * sizeof(*(motion_models[i].params))); - motion_models[i].num_inliers = 0; - continue; + if (motions[i].num_inliers <= 0) { + // Output model has already been initialized to the identity model, + // so just skip setup + continue; + } + + bool bad_model = false; + for (int refine_count = 0; refine_count < NUM_REFINES; refine_count++) { + int num_inliers = motions[i].num_inliers; + assert(num_inliers >= min_inliers); + + if (!model_info->find_transformation(matched_points, + motions[i].inlier_indices, + num_inliers, params_this_motion)) { + // In the unlikely event that this model fitting fails, we don't have a + // good fallback. So leave this model set to the identity model + bad_model = true; + break; } - // Populate inliers array - for (int j = 0; j < num_inliers; j++) { - int index = motions[i].inlier_indices[j]; - const Correspondence *corr = &matched_points[index]; - motion_models[i].inliers[2 * j + 0] = (int)rint(corr->x); - motion_models[i].inliers[2 * j + 1] = (int)rint(corr->y); + // Score the newly generated model + model_info->score_model(params_this_motion, matched_points, npoints, + ¤t_motion); + + // At this point, there are three possibilities: + // 1) If we found more inliers, keep refining. + // 2) If we found the same number of inliers but a lower SSE, we want to + // keep the new model, but further refinement is unlikely to gain much. + // So commit to this new model + // 3) It is possible, but very unlikely, that the new model will have + // fewer inliers. If it does happen, we probably just lost a few + // borderline inliers. So treat the same as case (2). + if (current_motion.num_inliers > motions[i].num_inliers) { + motions[i].num_inliers = current_motion.num_inliers; + motions[i].sse = current_motion.sse; + int *tmp = motions[i].inlier_indices; + motions[i].inlier_indices = current_motion.inlier_indices; + current_motion.inlier_indices = tmp; + } else { + // Refined model is no better, so stop + // This shouldn't be significantly worse than the previous model, + // so it's fine to use the parameters in params_this_motion. + // This saves us from having to cache the previous iteration's params. 
+ break; } - motion_models[i].num_inliers = num_inliers; - } else { - memcpy(motion_models[i].params, kIdentityParams, - MAX_PARAMDIM * sizeof(*(motion_models[i].params))); - motion_models[i].num_inliers = 0; } + + if (bad_model) continue; + + // Fill in output struct + memcpy(motion_models[i].params, params_this_motion, + MAX_PARAMDIM * sizeof(*motion_models[i].params)); + for (int j = 0; j < motions[i].num_inliers; j++) { + int index = motions[i].inlier_indices[j]; + const Correspondence *corr = &matched_points[index]; + motion_models[i].inliers[2 * j + 0] = (int)rint(corr->x); + motion_models[i].inliers[2 * j + 1] = (int)rint(corr->y); + } + motion_models[i].num_inliers = motions[i].num_inliers; } finish_ransac: aom_free(inlier_buffer); aom_free(motions); - aom_free(projected_corners); - aom_free(corners2); - aom_free(corners1); - aom_free(points2); - aom_free(points1); return ret_val; } -static bool is_collinear3(double *p1, double *p2, double *p3) { - static const double collinear_eps = 1e-3; - const double v = - (p2[0] - p1[0]) * (p3[1] - p1[1]) - (p2[1] - p1[1]) * (p3[0] - p1[0]); - return fabs(v) < collinear_eps; -} - -#if ALLOW_TRANSLATION_MODELS -static bool is_degenerate_translation(double *p) { - return (p[0] - p[2]) * (p[0] - p[2]) + (p[1] - p[3]) * (p[1] - p[3]) <= 2; -} -#endif // ALLOW_TRANSLATION_MODELS - -static bool is_degenerate_affine(double *p) { - return is_collinear3(p, p + 2, p + 4); -} - static const RansacModelInfo ransac_model_info[TRANS_TYPES] = { // IDENTITY - { NULL, NULL, NULL, 0 }, + { NULL, NULL, 0 }, // TRANSLATION #if ALLOW_TRANSLATION_MODELS - { is_degenerate_translation, find_translation, project_points_translation, - 3 }, + { find_translation, score_translation, 1 }, #else - { NULL, NULL, NULL, 0 }, + { NULL, NULL, 0 }, #endif // ROTZOOM - { is_degenerate_affine, find_rotzoom, project_points_affine, 3 }, + { find_rotzoom, score_affine, 2 }, // AFFINE - { is_degenerate_affine, find_affine, project_points_affine, 3 }, + { find_affine, score_affine, 3 }, }; // Returns true on success, false on error diff --git a/third_party/aom/aom_dsp/flow_estimation/x86/corner_match_avx2.c b/third_party/aom/aom_dsp/flow_estimation/x86/corner_match_avx2.c index 87c76fa13b..ff69ae75f5 100644 --- a/third_party/aom/aom_dsp/flow_estimation/x86/corner_match_avx2.c +++ b/third_party/aom/aom_dsp/flow_estimation/x86/corner_match_avx2.c @@ -17,64 +17,112 @@ #include "aom_ports/mem.h" #include "aom_dsp/flow_estimation/corner_match.h" -DECLARE_ALIGNED(16, static const uint8_t, - byte_mask[16]) = { 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 0, 0, 0 }; -#if MATCH_SZ != 13 -#error "Need to change byte_mask in corner_match_sse4.c if MATCH_SZ != 13" +DECLARE_ALIGNED(32, static const uint16_t, ones_array[16]) = { 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1 }; + +#if MATCH_SZ != 16 +#error "Need to apply pixel mask in corner_match_avx2.c if MATCH_SZ != 16" #endif -/* Compute corr(frame1, frame2) * MATCH_SZ * stddev(frame1), where the -correlation/standard deviation are taken over MATCH_SZ by MATCH_SZ windows -of each image, centered at (x1, y1) and (x2, y2) respectively. +/* Compute mean and standard deviation of pixels in a window of size + MATCH_SZ by MATCH_SZ centered at (x, y). 
+ Store results into *mean and *one_over_stddev + + Note: The output of this function is scaled by MATCH_SZ, as in + *mean = MATCH_SZ * and + *one_over_stddev = 1 / (MATCH_SZ * ) + + Combined with the fact that we return 1/stddev rather than the standard + deviation itself, this allows us to completely avoid divisions in + aom_compute_correlation, which is much hotter than this function is. + + Returns true if this feature point is usable, false otherwise. */ -double av1_compute_cross_correlation_avx2(const unsigned char *frame1, - int stride1, int x1, int y1, - const unsigned char *frame2, - int stride2, int x2, int y2) { - int i, stride1_i = 0, stride2_i = 0; - __m256i temp1, sum_vec, sumsq2_vec, cross_vec, v, v1_1, v2_1; - const __m128i mask = _mm_load_si128((__m128i *)byte_mask); - const __m256i zero = _mm256_setzero_si256(); - __m128i v1, v2; - - sum_vec = zero; - sumsq2_vec = zero; - cross_vec = zero; +bool aom_compute_mean_stddev_avx2(const unsigned char *frame, int stride, int x, + int y, double *mean, + double *one_over_stddev) { + __m256i sum_vec = _mm256_setzero_si256(); + __m256i sumsq_vec = _mm256_setzero_si256(); + + frame += (y - MATCH_SZ_BY2) * stride + (x - MATCH_SZ_BY2); + + for (int i = 0; i < MATCH_SZ; ++i) { + const __m256i v = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *)frame)); + + sum_vec = _mm256_add_epi16(sum_vec, v); + sumsq_vec = _mm256_add_epi32(sumsq_vec, _mm256_madd_epi16(v, v)); + + frame += stride; + } + + // Reduce sum_vec and sumsq_vec into single values + // Start by reducing each vector to 8x32-bit values, hadd() to perform 8 + // additions, sum vertically to do 4 more, then the last 2 in scalar code. + const __m256i ones = _mm256_load_si256((__m256i *)ones_array); + const __m256i partial_sum = _mm256_madd_epi16(sum_vec, ones); + const __m256i tmp_8x32 = _mm256_hadd_epi32(partial_sum, sumsq_vec); + const __m128i tmp_4x32 = _mm_add_epi32(_mm256_extracti128_si256(tmp_8x32, 0), + _mm256_extracti128_si256(tmp_8x32, 1)); + const int sum = + _mm_extract_epi32(tmp_4x32, 0) + _mm_extract_epi32(tmp_4x32, 1); + const int sumsq = + _mm_extract_epi32(tmp_4x32, 2) + _mm_extract_epi32(tmp_4x32, 3); + + *mean = (double)sum / MATCH_SZ; + const double variance = sumsq - (*mean) * (*mean); + if (variance < MIN_FEATURE_VARIANCE) { + *one_over_stddev = 0.0; + return false; + } + *one_over_stddev = 1.0 / sqrt(variance); + return true; +} + +/* Compute corr(frame1, frame2) over a window of size MATCH_SZ by MATCH_SZ. + To save on computation, the mean and (1 divided by the) standard deviation + of the window in each frame are precomputed and passed into this function + as arguments. 
+*/ +double aom_compute_correlation_avx2(const unsigned char *frame1, int stride1, + int x1, int y1, double mean1, + double one_over_stddev1, + const unsigned char *frame2, int stride2, + int x2, int y2, double mean2, + double one_over_stddev2) { + __m256i cross_vec = _mm256_setzero_si256(); frame1 += (y1 - MATCH_SZ_BY2) * stride1 + (x1 - MATCH_SZ_BY2); frame2 += (y2 - MATCH_SZ_BY2) * stride2 + (x2 - MATCH_SZ_BY2); - for (i = 0; i < MATCH_SZ; ++i) { - v1 = _mm_and_si128(_mm_loadu_si128((__m128i *)&frame1[stride1_i]), mask); - v1_1 = _mm256_cvtepu8_epi16(v1); - v2 = _mm_and_si128(_mm_loadu_si128((__m128i *)&frame2[stride2_i]), mask); - v2_1 = _mm256_cvtepu8_epi16(v2); + for (int i = 0; i < MATCH_SZ; ++i) { + const __m256i v1 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *)frame1)); + const __m256i v2 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *)frame2)); - v = _mm256_insertf128_si256(_mm256_castsi128_si256(v1), v2, 1); - sumsq2_vec = _mm256_add_epi32(sumsq2_vec, _mm256_madd_epi16(v2_1, v2_1)); + cross_vec = _mm256_add_epi32(cross_vec, _mm256_madd_epi16(v1, v2)); - sum_vec = _mm256_add_epi16(sum_vec, _mm256_sad_epu8(v, zero)); - cross_vec = _mm256_add_epi32(cross_vec, _mm256_madd_epi16(v1_1, v2_1)); - stride1_i += stride1; - stride2_i += stride2; + frame1 += stride1; + frame2 += stride2; } - __m256i sum_vec1 = _mm256_srli_si256(sum_vec, 8); - sum_vec = _mm256_add_epi32(sum_vec, sum_vec1); - int sum1_acc = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_vec)); - int sum2_acc = _mm256_extract_epi32(sum_vec, 4); - - __m256i unp_low = _mm256_unpacklo_epi64(sumsq2_vec, cross_vec); - __m256i unp_hig = _mm256_unpackhi_epi64(sumsq2_vec, cross_vec); - temp1 = _mm256_add_epi32(unp_low, unp_hig); - - __m128i low_sumsq = _mm256_castsi256_si128(temp1); - low_sumsq = _mm_add_epi32(low_sumsq, _mm256_extractf128_si256(temp1, 1)); - low_sumsq = _mm_add_epi32(low_sumsq, _mm_srli_epi64(low_sumsq, 32)); - int sumsq2_acc = _mm_cvtsi128_si32(low_sumsq); - int cross_acc = _mm_extract_epi32(low_sumsq, 2); - - int var2 = sumsq2_acc * MATCH_SZ_SQ - sum2_acc * sum2_acc; - int cov = cross_acc * MATCH_SZ_SQ - sum1_acc * sum2_acc; - return cov / sqrt((double)var2); + + // Sum cross_vec into a single value + const __m128i tmp = _mm_add_epi32(_mm256_extracti128_si256(cross_vec, 0), + _mm256_extracti128_si256(cross_vec, 1)); + const int cross = _mm_extract_epi32(tmp, 0) + _mm_extract_epi32(tmp, 1) + + _mm_extract_epi32(tmp, 2) + _mm_extract_epi32(tmp, 3); + + // Note: In theory, the calculations here "should" be + // covariance = cross / N^2 - mean1 * mean2 + // correlation = covariance / (stddev1 * stddev2). + // + // However, because of the scaling in aom_compute_mean_stddev, the + // lines below actually calculate + // covariance * N^2 = cross - (mean1 * N) * (mean2 * N) + // correlation = (covariance * N^2) / ((stddev1 * N) * (stddev2 * N)) + // + // ie. 
we have removed the need for a division, and still end up with the + // correct unscaled correlation (ie, in the range [-1, +1]) + const double covariance = cross - mean1 * mean2; + const double correlation = covariance * (one_over_stddev1 * one_over_stddev2); + return correlation; } diff --git a/third_party/aom/aom_dsp/flow_estimation/x86/corner_match_sse4.c b/third_party/aom/aom_dsp/flow_estimation/x86/corner_match_sse4.c index b3cb5bc5fd..bff7db6d2f 100644 --- a/third_party/aom/aom_dsp/flow_estimation/x86/corner_match_sse4.c +++ b/third_party/aom/aom_dsp/flow_estimation/x86/corner_match_sse4.c @@ -21,84 +21,125 @@ #include "aom_ports/mem.h" #include "aom_dsp/flow_estimation/corner_match.h" -DECLARE_ALIGNED(16, static const uint8_t, - byte_mask[16]) = { 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 0, 0, 0 }; -#if MATCH_SZ != 13 -#error "Need to change byte_mask in corner_match_sse4.c if MATCH_SZ != 13" +DECLARE_ALIGNED(16, static const uint16_t, ones_array[8]) = { 1, 1, 1, 1, + 1, 1, 1, 1 }; + +#if MATCH_SZ != 16 +#error "Need to apply pixel mask in corner_match_sse4.c if MATCH_SZ != 16" #endif -/* Compute corr(frame1, frame2) * MATCH_SZ * stddev(frame1), where the - correlation/standard deviation are taken over MATCH_SZ by MATCH_SZ windows - of each image, centered at (x1, y1) and (x2, y2) respectively. +/* Compute mean and standard deviation of pixels in a window of size + MATCH_SZ by MATCH_SZ centered at (x, y). + Store results into *mean and *one_over_stddev + + Note: The output of this function is scaled by MATCH_SZ, as in + *mean = MATCH_SZ * and + *one_over_stddev = 1 / (MATCH_SZ * ) + + Combined with the fact that we return 1/stddev rather than the standard + deviation itself, this allows us to completely avoid divisions in + aom_compute_correlation, which is much hotter than this function is. + + Returns true if this feature point is usable, false otherwise. +*/ +bool aom_compute_mean_stddev_sse4_1(const unsigned char *frame, int stride, + int x, int y, double *mean, + double *one_over_stddev) { + // 8 16-bit partial sums of pixels + // Each lane sums at most 2*MATCH_SZ pixels, which can have values up to 255, + // and is therefore at most 2*MATCH_SZ*255, which is > 2^8 but < 2^16. + // Thus this value is safe to store in 16 bits. + __m128i sum_vec = _mm_setzero_si128(); + + // 8 32-bit partial sums of squares + __m128i sumsq_vec_l = _mm_setzero_si128(); + __m128i sumsq_vec_r = _mm_setzero_si128(); + + frame += (y - MATCH_SZ_BY2) * stride + (x - MATCH_SZ_BY2); + + for (int i = 0; i < MATCH_SZ; ++i) { + const __m128i v = _mm_loadu_si128((__m128i *)frame); + const __m128i v_l = _mm_cvtepu8_epi16(v); + const __m128i v_r = _mm_cvtepu8_epi16(_mm_srli_si128(v, 8)); + + sum_vec = _mm_add_epi16(sum_vec, _mm_add_epi16(v_l, v_r)); + sumsq_vec_l = _mm_add_epi32(sumsq_vec_l, _mm_madd_epi16(v_l, v_l)); + sumsq_vec_r = _mm_add_epi32(sumsq_vec_r, _mm_madd_epi16(v_r, v_r)); + + frame += stride; + } + + // Reduce sum_vec and sumsq_vec into single values + // Start by reducing each vector to 4x32-bit values, hadd() to perform four + // additions, then perform the last two additions in scalar code. 
+ const __m128i ones = _mm_load_si128((__m128i *)ones_array); + const __m128i partial_sum = _mm_madd_epi16(sum_vec, ones); + const __m128i partial_sumsq = _mm_add_epi32(sumsq_vec_l, sumsq_vec_r); + const __m128i tmp = _mm_hadd_epi32(partial_sum, partial_sumsq); + const int sum = _mm_extract_epi32(tmp, 0) + _mm_extract_epi32(tmp, 1); + const int sumsq = _mm_extract_epi32(tmp, 2) + _mm_extract_epi32(tmp, 3); + + *mean = (double)sum / MATCH_SZ; + const double variance = sumsq - (*mean) * (*mean); + if (variance < MIN_FEATURE_VARIANCE) { + *one_over_stddev = 0.0; + return false; + } + *one_over_stddev = 1.0 / sqrt(variance); + return true; +} + +/* Compute corr(frame1, frame2) over a window of size MATCH_SZ by MATCH_SZ. + To save on computation, the mean and (1 divided by the) standard deviation + of the window in each frame are precomputed and passed into this function + as arguments. */ -double av1_compute_cross_correlation_sse4_1(const unsigned char *frame1, - int stride1, int x1, int y1, - const unsigned char *frame2, - int stride2, int x2, int y2) { - int i; - // 2 16-bit partial sums in lanes 0, 4 (== 2 32-bit partial sums in lanes 0, - // 2) - __m128i sum1_vec = _mm_setzero_si128(); - __m128i sum2_vec = _mm_setzero_si128(); - // 4 32-bit partial sums of squares - __m128i sumsq2_vec = _mm_setzero_si128(); - __m128i cross_vec = _mm_setzero_si128(); - - const __m128i mask = _mm_load_si128((__m128i *)byte_mask); - const __m128i zero = _mm_setzero_si128(); +double aom_compute_correlation_sse4_1(const unsigned char *frame1, int stride1, + int x1, int y1, double mean1, + double one_over_stddev1, + const unsigned char *frame2, int stride2, + int x2, int y2, double mean2, + double one_over_stddev2) { + // 8 32-bit partial sums of products + __m128i cross_vec_l = _mm_setzero_si128(); + __m128i cross_vec_r = _mm_setzero_si128(); frame1 += (y1 - MATCH_SZ_BY2) * stride1 + (x1 - MATCH_SZ_BY2); frame2 += (y2 - MATCH_SZ_BY2) * stride2 + (x2 - MATCH_SZ_BY2); - for (i = 0; i < MATCH_SZ; ++i) { - const __m128i v1 = - _mm_and_si128(_mm_loadu_si128((__m128i *)&frame1[i * stride1]), mask); - const __m128i v2 = - _mm_and_si128(_mm_loadu_si128((__m128i *)&frame2[i * stride2]), mask); - - // Using the 'sad' intrinsic here is a bit faster than adding - // v1_l + v1_r and v2_l + v2_r, plus it avoids the need for a 16->32 bit - // conversion step later, for a net speedup of ~10% - sum1_vec = _mm_add_epi16(sum1_vec, _mm_sad_epu8(v1, zero)); - sum2_vec = _mm_add_epi16(sum2_vec, _mm_sad_epu8(v2, zero)); + for (int i = 0; i < MATCH_SZ; ++i) { + const __m128i v1 = _mm_loadu_si128((__m128i *)frame1); + const __m128i v2 = _mm_loadu_si128((__m128i *)frame2); const __m128i v1_l = _mm_cvtepu8_epi16(v1); const __m128i v1_r = _mm_cvtepu8_epi16(_mm_srli_si128(v1, 8)); const __m128i v2_l = _mm_cvtepu8_epi16(v2); const __m128i v2_r = _mm_cvtepu8_epi16(_mm_srli_si128(v2, 8)); - sumsq2_vec = _mm_add_epi32( - sumsq2_vec, - _mm_add_epi32(_mm_madd_epi16(v2_l, v2_l), _mm_madd_epi16(v2_r, v2_r))); - cross_vec = _mm_add_epi32( - cross_vec, - _mm_add_epi32(_mm_madd_epi16(v1_l, v2_l), _mm_madd_epi16(v1_r, v2_r))); + cross_vec_l = _mm_add_epi32(cross_vec_l, _mm_madd_epi16(v1_l, v2_l)); + cross_vec_r = _mm_add_epi32(cross_vec_r, _mm_madd_epi16(v1_r, v2_r)); + + frame1 += stride1; + frame2 += stride2; } - // Now we can treat the four registers (sum1_vec, sum2_vec, sumsq2_vec, - // cross_vec) - // as holding 4 32-bit elements each, which we want to sum horizontally. - // We do this by transposing and then summing vertically. 
- __m128i tmp_0 = _mm_unpacklo_epi32(sum1_vec, sum2_vec); - __m128i tmp_1 = _mm_unpackhi_epi32(sum1_vec, sum2_vec); - __m128i tmp_2 = _mm_unpacklo_epi32(sumsq2_vec, cross_vec); - __m128i tmp_3 = _mm_unpackhi_epi32(sumsq2_vec, cross_vec); - - __m128i tmp_4 = _mm_unpacklo_epi64(tmp_0, tmp_2); - __m128i tmp_5 = _mm_unpackhi_epi64(tmp_0, tmp_2); - __m128i tmp_6 = _mm_unpacklo_epi64(tmp_1, tmp_3); - __m128i tmp_7 = _mm_unpackhi_epi64(tmp_1, tmp_3); - - __m128i res = - _mm_add_epi32(_mm_add_epi32(tmp_4, tmp_5), _mm_add_epi32(tmp_6, tmp_7)); - - int sum1 = _mm_extract_epi32(res, 0); - int sum2 = _mm_extract_epi32(res, 1); - int sumsq2 = _mm_extract_epi32(res, 2); - int cross = _mm_extract_epi32(res, 3); - - int var2 = sumsq2 * MATCH_SZ_SQ - sum2 * sum2; - int cov = cross * MATCH_SZ_SQ - sum1 * sum2; - return cov / sqrt((double)var2); + // Sum cross_vec into a single value + const __m128i tmp = _mm_add_epi32(cross_vec_l, cross_vec_r); + const int cross = _mm_extract_epi32(tmp, 0) + _mm_extract_epi32(tmp, 1) + + _mm_extract_epi32(tmp, 2) + _mm_extract_epi32(tmp, 3); + + // Note: In theory, the calculations here "should" be + // covariance = cross / N^2 - mean1 * mean2 + // correlation = covariance / (stddev1 * stddev2). + // + // However, because of the scaling in aom_compute_mean_stddev, the + // lines below actually calculate + // covariance * N^2 = cross - (mean1 * N) * (mean2 * N) + // correlation = (covariance * N^2) / ((stddev1 * N) * (stddev2 * N)) + // + // ie. we have removed the need for a division, and still end up with the + // correct unscaled correlation (ie, in the range [-1, +1]) + const double covariance = cross - mean1 * mean2; + const double correlation = covariance * (one_over_stddev1 * one_over_stddev2); + return correlation; } diff --git a/third_party/aom/aom_dsp/flow_estimation/x86/disflow_avx2.c b/third_party/aom/aom_dsp/flow_estimation/x86/disflow_avx2.c new file mode 100644 index 0000000000..ad5a1bd7c6 --- /dev/null +++ b/third_party/aom/aom_dsp/flow_estimation/x86/disflow_avx2.c @@ -0,0 +1,417 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/flow_estimation/disflow.h" +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/synonyms_avx2.h" + +#include "config/aom_dsp_rtcd.h" + +#if DISFLOW_PATCH_SIZE != 8 +#error "Need to change disflow_avx2.c if DISFLOW_PATCH_SIZE != 8" +#endif + +// Compute horizontal and vertical kernels and return them packed into a +// register. 
The coefficient ordering is: +// h0, h1, v0, v1, h2, h3, v2, v3 +// This is chosen because it takes less work than fully separating the kernels, +// but it is separated enough that we can pick out each coefficient pair in the +// main compute_flow_at_point function +static INLINE __m128i compute_cubic_kernels(double u, double v) { + const __m128d x = _mm_set_pd(v, u); + + const __m128d x2 = _mm_mul_pd(x, x); + const __m128d x3 = _mm_mul_pd(x2, x); + + // Macro to multiply a value v by a constant coefficient c +#define MULC(c, v) _mm_mul_pd(_mm_set1_pd(c), v) + + // Compute floating-point kernel + // Note: To ensure results are bit-identical to the C code, we need to perform + // exactly the same sequence of operations here as in the C code. + __m128d k0 = _mm_sub_pd(_mm_add_pd(MULC(-0.5, x), x2), MULC(0.5, x3)); + __m128d k1 = + _mm_add_pd(_mm_sub_pd(_mm_set1_pd(1.0), MULC(2.5, x2)), MULC(1.5, x3)); + __m128d k2 = + _mm_sub_pd(_mm_add_pd(MULC(0.5, x), MULC(2.0, x2)), MULC(1.5, x3)); + __m128d k3 = _mm_add_pd(MULC(-0.5, x2), MULC(0.5, x3)); +#undef MULC + + // Integerize + __m128d prec = _mm_set1_pd((double)(1 << DISFLOW_INTERP_BITS)); + + k0 = _mm_round_pd(_mm_mul_pd(k0, prec), + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + k1 = _mm_round_pd(_mm_mul_pd(k1, prec), + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + k2 = _mm_round_pd(_mm_mul_pd(k2, prec), + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + k3 = _mm_round_pd(_mm_mul_pd(k3, prec), + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + + const __m128i c0 = _mm_cvtpd_epi32(k0); + const __m128i c1 = _mm_cvtpd_epi32(k1); + const __m128i c2 = _mm_cvtpd_epi32(k2); + const __m128i c3 = _mm_cvtpd_epi32(k3); + + // Rearrange results and convert down to 16 bits, giving the target output + // ordering + const __m128i c01 = _mm_unpacklo_epi32(c0, c1); + const __m128i c23 = _mm_unpacklo_epi32(c2, c3); + return _mm_packs_epi32(c01, c23); +} + +// Compare two regions of width x height pixels, one rooted at position +// (x, y) in src and the other at (x + u, y + v) in ref. +// This function returns the sum of squared pixel differences between +// the two regions. +// +// TODO(rachelbarker): Test speed/quality impact of using bilinear interpolation +// instad of bicubic interpolation +static INLINE void compute_flow_vector(const uint8_t *src, const uint8_t *ref, + int width, int height, int stride, int x, + int y, double u, double v, + const int16_t *dx, const int16_t *dy, + int *b) { + const __m256i zero = _mm256_setzero_si256(); + + // Accumulate 8 32-bit partial sums for each element of b + // These will be flattened at the end. + __m256i b0_acc = _mm256_setzero_si256(); + __m256i b1_acc = _mm256_setzero_si256(); + + // Split offset into integer and fractional parts, and compute cubic + // interpolation kernels + const int u_int = (int)floor(u); + const int v_int = (int)floor(v); + const double u_frac = u - floor(u); + const double v_frac = v - floor(v); + + const __m128i kernels = compute_cubic_kernels(u_frac, v_frac); + + // Storage for intermediate values between the two convolution directions + // In the AVX2 implementation, this needs a dummy row at the end, because + // we generate 2 rows at a time but the total number of rows is odd. + // So we generate one more row than we actually need. 
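Concretely (a worked count, not from the upstream comments): the vertical pass below only ever reads rows -1 through DISFLOW_PATCH_SIZE + 1 of tmp, but because the horizontal pass emits two rows per iteration it also writes row DISFLOW_PATCH_SIZE + 2. Sizing tmp_ at DISFLOW_PATCH_SIZE + 4 rows, one more than the DISFLOW_PATCH_SIZE + 3 used by the SSE4.1 version, keeps that final, unused store in bounds.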
+ DECLARE_ALIGNED(32, int16_t, + tmp_[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 4)]); + int16_t *tmp = tmp_ + DISFLOW_PATCH_SIZE; // Offset by one row + + // Clamp coordinates so that all pixels we fetch will remain within the + // allocated border region, but allow them to go far enough out that + // the border pixels' values do not change. + // Since we are calculating an 8x8 block, the bottom-right pixel + // in the block has coordinates (x0 + 7, y0 + 7). Then, the cubic + // interpolation has 4 taps, meaning that the output of pixel + // (x_w, y_w) depends on the pixels in the range + // ([x_w - 1, x_w + 2], [y_w - 1, y_w + 2]). + // + // Thus the most extreme coordinates which will be fetched are + // (x0 - 1, y0 - 1) and (x0 + 9, y0 + 9). + const int x0 = clamp(x + u_int, -9, width); + const int y0 = clamp(y + v_int, -9, height); + + // Horizontal convolution + + // Prepare the kernel vectors + // We split the kernel into two vectors with kernel indices: + // 0, 1, 0, 1, 0, 1, 0, 1, and + // 2, 3, 2, 3, 2, 3, 2, 3 + __m256i h_kernel_01 = _mm256_broadcastd_epi32(kernels); + __m256i h_kernel_23 = _mm256_broadcastd_epi32(_mm_srli_si128(kernels, 8)); + + __m256i round_const_h = _mm256_set1_epi32(1 << (DISFLOW_INTERP_BITS - 6 - 1)); + + for (int i = -1; i < DISFLOW_PATCH_SIZE + 2; i += 2) { + const int y_w = y0 + i; + const uint8_t *ref_row = &ref[y_w * stride + (x0 - 1)]; + int16_t *tmp_row = &tmp[i * DISFLOW_PATCH_SIZE]; + + // Load this row of pixels. + // For an 8x8 patch, we need to load the 8 image pixels + 3 extras, + // for a total of 11 pixels. Here we load 16 pixels, but only use + // the first 11. + __m256i row = + yy_loadu2_128((__m128i *)(ref_row + stride), (__m128i *)ref_row); + + // Expand pixels to int16s + // We must use unpacks here, as we have one row in each 128-bit lane + // and want to handle each of those independently. + // This is in contrast to _mm256_cvtepu8_epi16(), which takes a single + // 128-bit input and widens it to 256 bits. + __m256i px_0to7_i16 = _mm256_unpacklo_epi8(row, zero); + __m256i px_4to10_i16 = + _mm256_unpacklo_epi8(_mm256_srli_si256(row, 4), zero); + + // Compute first four outputs + // input pixels 0, 1, 1, 2, 2, 3, 3, 4 + // * kernel 0, 1, 0, 1, 0, 1, 0, 1 + __m256i px0 = + _mm256_unpacklo_epi16(px_0to7_i16, _mm256_srli_si256(px_0to7_i16, 2)); + // input pixels 2, 3, 3, 4, 4, 5, 5, 6 + // * kernel 2, 3, 2, 3, 2, 3, 2, 3 + __m256i px1 = _mm256_unpacklo_epi16(_mm256_srli_si256(px_0to7_i16, 4), + _mm256_srli_si256(px_0to7_i16, 6)); + // Convolve with kernel and sum 2x2 boxes to form first 4 outputs + __m256i sum0 = _mm256_add_epi32(_mm256_madd_epi16(px0, h_kernel_01), + _mm256_madd_epi16(px1, h_kernel_23)); + + __m256i out0 = _mm256_srai_epi32(_mm256_add_epi32(sum0, round_const_h), + DISFLOW_INTERP_BITS - 6); + + // Compute second four outputs + __m256i px2 = + _mm256_unpacklo_epi16(px_4to10_i16, _mm256_srli_si256(px_4to10_i16, 2)); + __m256i px3 = _mm256_unpacklo_epi16(_mm256_srli_si256(px_4to10_i16, 4), + _mm256_srli_si256(px_4to10_i16, 6)); + __m256i sum1 = _mm256_add_epi32(_mm256_madd_epi16(px2, h_kernel_01), + _mm256_madd_epi16(px3, h_kernel_23)); + + // Round by just enough bits that the result is + // guaranteed to fit into an i16. Then the next stage can use 16 x 16 -> 32 + // bit multiplies, which should be a fair bit faster than 32 x 32 -> 32 + // as it does now + // This means shifting down so we have 6 extra bits, for a maximum value + // of +18360, which can occur if u_frac == 0.5 and the input pixels are + // {0, 255, 255, 0}. 
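As a quick check of the bound above (worked arithmetic, not from the upstream comments): at u_frac == 0.5 the cubic weights evaluate to {-1/16, 9/16, 9/16, -1/16}, so with input pixels {0, 255, 255, 0} the unscaled output is 255 * 9/8 = 286.875. Keeping 6 fractional bits gives 18360, which fits in an int16_t, whereas 7 bits would give 36720, which does not.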
+ __m256i out1 = _mm256_srai_epi32(_mm256_add_epi32(sum1, round_const_h), + DISFLOW_INTERP_BITS - 6); + + _mm256_storeu_si256((__m256i *)tmp_row, _mm256_packs_epi32(out0, out1)); + } + + // Vertical convolution + const int round_bits = DISFLOW_INTERP_BITS + 6 - DISFLOW_DERIV_SCALE_LOG2; + __m256i round_const_v = _mm256_set1_epi32(1 << (round_bits - 1)); + + __m256i v_kernel_01 = _mm256_broadcastd_epi32(_mm_srli_si128(kernels, 4)); + __m256i v_kernel_23 = _mm256_broadcastd_epi32(_mm_srli_si128(kernels, 12)); + + for (int i = 0; i < DISFLOW_PATCH_SIZE; i += 2) { + int16_t *tmp_row = &tmp[i * DISFLOW_PATCH_SIZE]; + + // Load 5 rows of 8 x 16-bit values, and pack into 4 registers + // holding rows {0, 1}, {1, 2}, {2, 3}, {3, 4} + __m128i row0 = _mm_loadu_si128((__m128i *)(tmp_row - DISFLOW_PATCH_SIZE)); + __m128i row1 = _mm_loadu_si128((__m128i *)tmp_row); + __m128i row2 = _mm_loadu_si128((__m128i *)(tmp_row + DISFLOW_PATCH_SIZE)); + __m128i row3 = + _mm_loadu_si128((__m128i *)(tmp_row + 2 * DISFLOW_PATCH_SIZE)); + __m128i row4 = + _mm_loadu_si128((__m128i *)(tmp_row + 3 * DISFLOW_PATCH_SIZE)); + + __m256i px0 = _mm256_set_m128i(row1, row0); + __m256i px1 = _mm256_set_m128i(row2, row1); + __m256i px2 = _mm256_set_m128i(row3, row2); + __m256i px3 = _mm256_set_m128i(row4, row3); + + // We want to calculate px0 * v_kernel[0] + px1 * v_kernel[1] + ... , + // but each multiply expands its output to 32 bits. So we need to be + // a little clever about how we do this + __m256i sum0 = _mm256_add_epi32( + _mm256_madd_epi16(_mm256_unpacklo_epi16(px0, px1), v_kernel_01), + _mm256_madd_epi16(_mm256_unpacklo_epi16(px2, px3), v_kernel_23)); + __m256i sum1 = _mm256_add_epi32( + _mm256_madd_epi16(_mm256_unpackhi_epi16(px0, px1), v_kernel_01), + _mm256_madd_epi16(_mm256_unpackhi_epi16(px2, px3), v_kernel_23)); + + __m256i sum0_rounded = + _mm256_srai_epi32(_mm256_add_epi32(sum0, round_const_v), round_bits); + __m256i sum1_rounded = + _mm256_srai_epi32(_mm256_add_epi32(sum1, round_const_v), round_bits); + + __m256i warped = _mm256_packs_epi32(sum0_rounded, sum1_rounded); + __m128i src_pixels_u8 = xx_loadu_2x64(&src[(y + i + 1) * stride + x], + &src[(y + i) * stride + x]); + __m256i src_pixels = + _mm256_slli_epi16(_mm256_cvtepu8_epi16(src_pixels_u8), 3); + + // Calculate delta from the target patch + __m256i dt = _mm256_sub_epi16(warped, src_pixels); + + // Load 2x8 elements each of dx and dt, to pair with the 2x8 elements of dt + // that we have just computed. Then compute 2x8 partial sums of dx * dt + // and dy * dt, implicitly sum to give 2x4 partial sums of each, and + // accumulate. + __m256i dx_row = _mm256_loadu_si256((__m256i *)&dx[i * DISFLOW_PATCH_SIZE]); + __m256i dy_row = _mm256_loadu_si256((__m256i *)&dy[i * DISFLOW_PATCH_SIZE]); + b0_acc = _mm256_add_epi32(b0_acc, _mm256_madd_epi16(dx_row, dt)); + b1_acc = _mm256_add_epi32(b1_acc, _mm256_madd_epi16(dy_row, dt)); + } + + // Flatten the two sets of partial sums to find the final value of b + // We need to set b[0] = sum(b0_acc), b[1] = sum(b1_acc). + // We need to do 14 additions in total; a `hadd` instruction can take care + // of eight of them, then a vertical sum can do four more, leaving two + // scalar additions. 
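In scalar terms (an illustrative restatement, not from the upstream comments), the accumulators hold b[0] = sum over the patch of dx[i][j] * dt[i][j] and b[1] = sum over the patch of dy[i][j] * dt[i][j], where dt is the difference between the warped reference patch and the source patch scaled by 8 as computed above; the hadd/extract sequence below simply folds the eight 32-bit lanes of each accumulator down to those two scalars.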
+ __m256i partial_sum_256 = _mm256_hadd_epi32(b0_acc, b1_acc); + __m128i partial_sum = + _mm_add_epi32(_mm256_extracti128_si256(partial_sum_256, 0), + _mm256_extracti128_si256(partial_sum_256, 1)); + b[0] = _mm_extract_epi32(partial_sum, 0) + _mm_extract_epi32(partial_sum, 1); + b[1] = _mm_extract_epi32(partial_sum, 2) + _mm_extract_epi32(partial_sum, 3); +} + +// Compute the x and y gradients of the source patch in a single pass, +// and store into dx and dy respectively. +static INLINE void sobel_filter(const uint8_t *src, int src_stride, int16_t *dx, + int16_t *dy) { + const __m256i zero = _mm256_setzero_si256(); + + // Loop setup: Load the first two rows (of 10 input rows) and apply + // the horizontal parts of the two filters + __m256i row_m1_0 = + yy_loadu2_128((__m128i *)(src - 1), (__m128i *)(src - src_stride - 1)); + __m256i row_m1_0_a = _mm256_unpacklo_epi8(row_m1_0, zero); + __m256i row_m1_0_b = + _mm256_unpacklo_epi8(_mm256_srli_si256(row_m1_0, 1), zero); + __m256i row_m1_0_c = + _mm256_unpacklo_epi8(_mm256_srli_si256(row_m1_0, 2), zero); + + __m256i row_m1_0_hsmooth = + _mm256_add_epi16(_mm256_add_epi16(row_m1_0_a, row_m1_0_c), + _mm256_slli_epi16(row_m1_0_b, 1)); + __m256i row_m1_0_hdiff = _mm256_sub_epi16(row_m1_0_a, row_m1_0_c); + + // Main loop: For each pair of output rows (i, i+1): + // * Load rows (i+1, i+2) and apply both horizontal filters + // * Apply vertical filters and store results + // * Shift rows for next iteration + for (int i = 0; i < DISFLOW_PATCH_SIZE; i += 2) { + // Load rows (i+1, i+2) and apply both horizontal filters + const __m256i row_p1_p2 = + yy_loadu2_128((__m128i *)(src + (i + 2) * src_stride - 1), + (__m128i *)(src + (i + 1) * src_stride - 1)); + const __m256i row_p1_p2_a = _mm256_unpacklo_epi8(row_p1_p2, zero); + const __m256i row_p1_p2_b = + _mm256_unpacklo_epi8(_mm256_srli_si256(row_p1_p2, 1), zero); + const __m256i row_p1_p2_c = + _mm256_unpacklo_epi8(_mm256_srli_si256(row_p1_p2, 2), zero); + + const __m256i row_p1_p2_hsmooth = + _mm256_add_epi16(_mm256_add_epi16(row_p1_p2_a, row_p1_p2_c), + _mm256_slli_epi16(row_p1_p2_b, 1)); + const __m256i row_p1_p2_hdiff = _mm256_sub_epi16(row_p1_p2_a, row_p1_p2_c); + + // Apply vertical filters and store results + // dx = vertical smooth(horizontal diff(input)) + // dy = vertical diff(horizontal smooth(input)) + const __m256i row_0_p1_hdiff = + _mm256_permute2x128_si256(row_m1_0_hdiff, row_p1_p2_hdiff, 0x21); + const __m256i dx_row = + _mm256_add_epi16(_mm256_add_epi16(row_m1_0_hdiff, row_p1_p2_hdiff), + _mm256_slli_epi16(row_0_p1_hdiff, 1)); + const __m256i dy_row = + _mm256_sub_epi16(row_m1_0_hsmooth, row_p1_p2_hsmooth); + + _mm256_storeu_si256((__m256i *)(dx + i * DISFLOW_PATCH_SIZE), dx_row); + _mm256_storeu_si256((__m256i *)(dy + i * DISFLOW_PATCH_SIZE), dy_row); + + // Shift rows for next iteration + // This allows a lot of work to be reused, reducing the number of + // horizontal filtering operations from 2*3*8 = 48 to 2*10 = 20 + row_m1_0_hsmooth = row_p1_p2_hsmooth; + row_m1_0_hdiff = row_p1_p2_hdiff; + } +} + +static INLINE void compute_flow_matrix(const int16_t *dx, int dx_stride, + const int16_t *dy, int dy_stride, + double *M) { + __m256i acc[4] = { 0 }; + + for (int i = 0; i < DISFLOW_PATCH_SIZE; i += 2) { + __m256i dx_row = _mm256_loadu_si256((__m256i *)&dx[i * dx_stride]); + __m256i dy_row = _mm256_loadu_si256((__m256i *)&dy[i * dy_stride]); + + acc[0] = _mm256_add_epi32(acc[0], _mm256_madd_epi16(dx_row, dx_row)); + acc[1] = _mm256_add_epi32(acc[1], _mm256_madd_epi16(dx_row, dy_row)); + 
// Don't compute acc[2], as it should be equal to acc[1] + acc[3] = _mm256_add_epi32(acc[3], _mm256_madd_epi16(dy_row, dy_row)); + } + + // Condense sums + __m256i partial_sum_0 = _mm256_hadd_epi32(acc[0], acc[1]); + __m256i partial_sum_1 = _mm256_hadd_epi32(acc[1], acc[3]); + __m256i result_256 = _mm256_hadd_epi32(partial_sum_0, partial_sum_1); + __m128i result = _mm_add_epi32(_mm256_extracti128_si256(result_256, 0), + _mm256_extracti128_si256(result_256, 1)); + + // Apply regularization + // We follow the standard regularization method of adding `k * I` before + // inverting. This ensures that the matrix will be invertible. + // + // Setting the regularization strength k to 1 seems to work well here, as + // typical values coming from the other equations are very large (1e5 to + // 1e6, with an upper limit of around 6e7, at the time of writing). + // It also preserves the property that all matrix values are whole numbers, + // which is convenient for integerized SIMD implementation. + result = _mm_add_epi32(result, _mm_set_epi32(1, 0, 0, 1)); + + // Convert results to doubles and store + _mm256_storeu_pd(M, _mm256_cvtepi32_pd(result)); +} + +// Try to invert the matrix M +// Note: Due to the nature of how a least-squares matrix is constructed, all of +// the eigenvalues will be >= 0, and therefore det M >= 0 as well. +// The regularization term `+ k * I` further ensures that det M >= k^2. +// As mentioned in compute_flow_matrix(), here we use k = 1, so det M >= 1. +// So we don't have to worry about non-invertible matrices here. +static INLINE void invert_2x2(const double *M, double *M_inv) { + double det = (M[0] * M[3]) - (M[1] * M[2]); + assert(det >= 1); + const double det_inv = 1 / det; + + M_inv[0] = M[3] * det_inv; + M_inv[1] = -M[1] * det_inv; + M_inv[2] = -M[2] * det_inv; + M_inv[3] = M[0] * det_inv; +} + +void aom_compute_flow_at_point_avx2(const uint8_t *src, const uint8_t *ref, + int x, int y, int width, int height, + int stride, double *u, double *v) { + DECLARE_ALIGNED(32, double, M[4]); + DECLARE_ALIGNED(32, double, M_inv[4]); + DECLARE_ALIGNED(32, int16_t, dx[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE]); + DECLARE_ALIGNED(32, int16_t, dy[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE]); + int b[2]; + + // Compute gradients within this patch + const uint8_t *src_patch = &src[y * stride + x]; + sobel_filter(src_patch, stride, dx, dy); + + compute_flow_matrix(dx, DISFLOW_PATCH_SIZE, dy, DISFLOW_PATCH_SIZE, M); + invert_2x2(M, M_inv); + + for (int itr = 0; itr < DISFLOW_MAX_ITR; itr++) { + compute_flow_vector(src, ref, width, height, stride, x, y, *u, *v, dx, dy, + b); + + // Solve flow equations to find a better estimate for the flow vector + // at this point + const double step_u = M_inv[0] * b[0] + M_inv[1] * b[1]; + const double step_v = M_inv[2] * b[0] + M_inv[3] * b[1]; + *u += fclamp(step_u * DISFLOW_STEP_SIZE, -2, 2); + *v += fclamp(step_v * DISFLOW_STEP_SIZE, -2, 2); + + if (fabs(step_u) + fabs(step_v) < DISFLOW_STEP_SIZE_THRESOLD) { + // Stop iteration when we're close to convergence + break; + } + } +} diff --git a/third_party/aom/aom_dsp/flow_estimation/x86/disflow_sse4.c b/third_party/aom/aom_dsp/flow_estimation/x86/disflow_sse4.c index 2c5effd638..e0a4bd040c 100644 --- a/third_party/aom/aom_dsp/flow_estimation/x86/disflow_sse4.c +++ b/third_party/aom/aom_dsp/flow_estimation/x86/disflow_sse4.c @@ -1,13 +1,12 @@ /* - * Copyright (c) 2022, Alliance for Open Media. All rights reserved + * Copyright (c) 2024, Alliance for Open Media. 
All rights reserved * - * This source code is subject to the terms of the BSD 3-Clause Clear License - * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear - * License was not distributed with this source code in the LICENSE file, you - * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/. If the - * Alliance for Open Media Patent License 1.0 was not distributed with this - * source code in the PATENTS file, you can obtain it at - * aomedia.org/license/patent-license/. + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include @@ -20,46 +19,59 @@ #include "config/aom_dsp_rtcd.h" -// Internal cross-check against C code -// If you set this to 1 and compile in debug mode, then the outputs of the two -// convolution stages will be checked against the plain C version of the code, -// and an assertion will be fired if the results differ. -#define CHECK_RESULTS 0 - -// Note: Max sum(+ve coefficients) = 1.125 * scale -static INLINE void get_cubic_kernel_dbl(double x, double kernel[4]) { - // Check that the fractional position is in range. - // - // Note: x is calculated from, e.g., `u_frac = u - floor(u)`. - // Mathematically, this implies that 0 <= x < 1. However, in practice it is - // possible to have x == 1 due to floating point rounding. This is fine, - // and we still interpolate correctly if we allow x = 1. - assert(0 <= x && x <= 1); - - double x2 = x * x; - double x3 = x2 * x; - kernel[0] = -0.5 * x + x2 - 0.5 * x3; - kernel[1] = 1.0 - 2.5 * x2 + 1.5 * x3; - kernel[2] = 0.5 * x + 2.0 * x2 - 1.5 * x3; - kernel[3] = -0.5 * x2 + 0.5 * x3; -} - -static INLINE void get_cubic_kernel_int(double x, int16_t kernel[4]) { - double kernel_dbl[4]; - get_cubic_kernel_dbl(x, kernel_dbl); - - kernel[0] = (int16_t)rint(kernel_dbl[0] * (1 << DISFLOW_INTERP_BITS)); - kernel[1] = (int16_t)rint(kernel_dbl[1] * (1 << DISFLOW_INTERP_BITS)); - kernel[2] = (int16_t)rint(kernel_dbl[2] * (1 << DISFLOW_INTERP_BITS)); - kernel[3] = (int16_t)rint(kernel_dbl[3] * (1 << DISFLOW_INTERP_BITS)); -} - -#if CHECK_RESULTS -static INLINE int get_cubic_value_int(const int *p, const int16_t kernel[4]) { - return kernel[0] * p[0] + kernel[1] * p[1] + kernel[2] * p[2] + - kernel[3] * p[3]; +#if DISFLOW_PATCH_SIZE != 8 +#error "Need to change disflow_sse4.c if DISFLOW_PATCH_SIZE != 8" +#endif + +// Compute horizontal and vertical kernels and return them packed into a +// register. 
The coefficient ordering is: +// h0, h1, v0, v1, h2, h3, v2, v3 +// This is chosen because it takes less work than fully separating the kernels, +// but it is separated enough that we can pick out each coefficient pair in the +// main compute_flow_at_point function +static INLINE __m128i compute_cubic_kernels(double u, double v) { + const __m128d x = _mm_set_pd(v, u); + + const __m128d x2 = _mm_mul_pd(x, x); + const __m128d x3 = _mm_mul_pd(x2, x); + + // Macro to multiply a value v by a constant coefficient c +#define MULC(c, v) _mm_mul_pd(_mm_set1_pd(c), v) + + // Compute floating-point kernel + // Note: To ensure results are bit-identical to the C code, we need to perform + // exactly the same sequence of operations here as in the C code. + __m128d k0 = _mm_sub_pd(_mm_add_pd(MULC(-0.5, x), x2), MULC(0.5, x3)); + __m128d k1 = + _mm_add_pd(_mm_sub_pd(_mm_set1_pd(1.0), MULC(2.5, x2)), MULC(1.5, x3)); + __m128d k2 = + _mm_sub_pd(_mm_add_pd(MULC(0.5, x), MULC(2.0, x2)), MULC(1.5, x3)); + __m128d k3 = _mm_add_pd(MULC(-0.5, x2), MULC(0.5, x3)); +#undef MULC + + // Integerize + __m128d prec = _mm_set1_pd((double)(1 << DISFLOW_INTERP_BITS)); + + k0 = _mm_round_pd(_mm_mul_pd(k0, prec), + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + k1 = _mm_round_pd(_mm_mul_pd(k1, prec), + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + k2 = _mm_round_pd(_mm_mul_pd(k2, prec), + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + k3 = _mm_round_pd(_mm_mul_pd(k3, prec), + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + + const __m128i c0 = _mm_cvtpd_epi32(k0); + const __m128i c1 = _mm_cvtpd_epi32(k1); + const __m128i c2 = _mm_cvtpd_epi32(k2); + const __m128i c3 = _mm_cvtpd_epi32(k3); + + // Rearrange results and convert down to 16 bits, giving the target output + // ordering + const __m128i c01 = _mm_unpacklo_epi32(c0, c1); + const __m128i c23 = _mm_unpacklo_epi32(c2, c3); + return _mm_packs_epi32(c01, c23); } -#endif // CHECK_RESULTS // Compare two regions of width x height pixels, one rooted at position // (x, y) in src and the other at (x + u, y + v) in ref. @@ -80,10 +92,6 @@ static INLINE void compute_flow_vector(const uint8_t *src, const uint8_t *ref, // These will be flattened at the end. 
__m128i b0_acc = _mm_setzero_si128(); __m128i b1_acc = _mm_setzero_si128(); -#if CHECK_RESULTS - // Also keep a running sum using the C algorithm, for cross-checking - int c_result[2] = { 0 }; -#endif // CHECK_RESULTS // Split offset into integer and fractional parts, and compute cubic // interpolation kernels @@ -92,13 +100,11 @@ static INLINE void compute_flow_vector(const uint8_t *src, const uint8_t *ref, const double u_frac = u - floor(u); const double v_frac = v - floor(v); - int16_t h_kernel[4]; - int16_t v_kernel[4]; - get_cubic_kernel_int(u_frac, h_kernel); - get_cubic_kernel_int(v_frac, v_kernel); + const __m128i kernels = compute_cubic_kernels(u_frac, v_frac); // Storage for intermediate values between the two convolution directions - int16_t tmp_[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 3)]; + DECLARE_ALIGNED(16, int16_t, + tmp_[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 3)]); int16_t *tmp = tmp_ + DISFLOW_PATCH_SIZE; // Offset by one row // Clamp coordinates so that all pixels we fetch will remain within the @@ -121,8 +127,8 @@ static INLINE void compute_flow_vector(const uint8_t *src, const uint8_t *ref, // We split the kernel into two vectors with kernel indices: // 0, 1, 0, 1, 0, 1, 0, 1, and // 2, 3, 2, 3, 2, 3, 2, 3 - __m128i h_kernel_01 = xx_set2_epi16(h_kernel[0], h_kernel[1]); - __m128i h_kernel_23 = xx_set2_epi16(h_kernel[2], h_kernel[3]); + __m128i h_kernel_01 = _mm_set1_epi32(_mm_extract_epi32(kernels, 0)); + __m128i h_kernel_23 = _mm_set1_epi32(_mm_extract_epi32(kernels, 2)); __m128i round_const_h = _mm_set1_epi32(1 << (DISFLOW_INTERP_BITS - 6 - 1)); @@ -141,10 +147,6 @@ static INLINE void compute_flow_vector(const uint8_t *src, const uint8_t *ref, __m128i px_0to7_i16 = _mm_cvtepu8_epi16(row); __m128i px_4to10_i16 = _mm_cvtepu8_epi16(_mm_srli_si128(row, 4)); - // Relevant multiply instruction - // This multiplies pointwise, then sums in pairs. - //_mm_madd_epi16(); - // Compute first four outputs // input pixels 0, 1, 1, 2, 2, 3, 3, 4 // * kernel 0, 1, 0, 1, 0, 1, 0, 1 @@ -180,43 +182,14 @@ static INLINE void compute_flow_vector(const uint8_t *src, const uint8_t *ref, DISFLOW_INTERP_BITS - 6); _mm_storeu_si128((__m128i *)tmp_row, _mm_packs_epi32(out0, out1)); - -#if CHECK_RESULTS && !defined(NDEBUG) - // Cross-check - for (int j = 0; j < DISFLOW_PATCH_SIZE; ++j) { - const int x_w = x0 + j; - int arr[4]; - - arr[0] = (int)ref[y_w * stride + (x_w - 1)]; - arr[1] = (int)ref[y_w * stride + (x_w + 0)]; - arr[2] = (int)ref[y_w * stride + (x_w + 1)]; - arr[3] = (int)ref[y_w * stride + (x_w + 2)]; - - // Apply kernel and round, keeping 6 extra bits of precision. - // - // 6 is the maximum allowable number of extra bits which will avoid - // the intermediate values overflowing an int16_t. The most extreme - // intermediate value occurs when: - // * The input pixels are [0, 255, 255, 0] - // * u_frac = 0.5 - // In this case, the un-scaled output is 255 * 1.125 = 286.875. - // As an integer with 6 fractional bits, that is 18360, which fits - // in an int16_t. But with 7 fractional bits it would be 36720, - // which is too large. 
- const int c_value = ROUND_POWER_OF_TWO(get_cubic_value_int(arr, h_kernel), - DISFLOW_INTERP_BITS - 6); - (void)c_value; // Suppress warnings - assert(tmp_row[j] == c_value); - } -#endif // CHECK_RESULTS } // Vertical convolution const int round_bits = DISFLOW_INTERP_BITS + 6 - DISFLOW_DERIV_SCALE_LOG2; __m128i round_const_v = _mm_set1_epi32(1 << (round_bits - 1)); - __m128i v_kernel_01 = xx_set2_epi16(v_kernel[0], v_kernel[1]); - __m128i v_kernel_23 = xx_set2_epi16(v_kernel[2], v_kernel[3]); + __m128i v_kernel_01 = _mm_set1_epi32(_mm_extract_epi32(kernels, 1)); + __m128i v_kernel_23 = _mm_set1_epi32(_mm_extract_epi32(kernels, 3)); for (int i = 0; i < DISFLOW_PATCH_SIZE; ++i) { int16_t *tmp_row = &tmp[i * DISFLOW_PATCH_SIZE]; @@ -259,30 +232,6 @@ static INLINE void compute_flow_vector(const uint8_t *src, const uint8_t *ref, __m128i dy_row = _mm_loadu_si128((__m128i *)&dy[i * DISFLOW_PATCH_SIZE]); b0_acc = _mm_add_epi32(b0_acc, _mm_madd_epi16(dx_row, dt)); b1_acc = _mm_add_epi32(b1_acc, _mm_madd_epi16(dy_row, dt)); - -#if CHECK_RESULTS - int16_t dt_arr[8]; - memcpy(dt_arr, &dt, 8 * sizeof(*dt_arr)); - for (int j = 0; j < DISFLOW_PATCH_SIZE; ++j) { - int16_t *p = &tmp[i * DISFLOW_PATCH_SIZE + j]; - int arr[4] = { p[-DISFLOW_PATCH_SIZE], p[0], p[DISFLOW_PATCH_SIZE], - p[2 * DISFLOW_PATCH_SIZE] }; - const int result = get_cubic_value_int(arr, v_kernel); - - // Apply kernel and round. - // This time, we have to round off the 6 extra bits which were kept - // earlier, but we also want to keep DISFLOW_DERIV_SCALE_LOG2 extra bits - // of precision to match the scale of the dx and dy arrays. - const int c_warped = ROUND_POWER_OF_TWO(result, round_bits); - const int c_src_px = src[(x + j) + (y + i) * stride] << 3; - const int c_dt = c_warped - c_src_px; - - assert(dt_arr[j] == c_dt); - - c_result[0] += dx[i * DISFLOW_PATCH_SIZE + j] * c_dt; - c_result[1] += dy[i * DISFLOW_PATCH_SIZE + j] * c_dt; - } -#endif // CHECK_RESULTS } // Flatten the two sets of partial sums to find the final value of b @@ -292,156 +241,66 @@ static INLINE void compute_flow_vector(const uint8_t *src, const uint8_t *ref, __m128i partial_sum = _mm_hadd_epi32(b0_acc, b1_acc); b[0] = _mm_extract_epi32(partial_sum, 0) + _mm_extract_epi32(partial_sum, 1); b[1] = _mm_extract_epi32(partial_sum, 2) + _mm_extract_epi32(partial_sum, 3); - -#if CHECK_RESULTS - assert(b[0] == c_result[0]); - assert(b[1] == c_result[1]); -#endif // CHECK_RESULTS } -static INLINE void sobel_filter_x(const uint8_t *src, int src_stride, - int16_t *dst, int dst_stride) { - int16_t tmp_[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 2)]; - int16_t *tmp = tmp_ + DISFLOW_PATCH_SIZE; -#if CHECK_RESULTS - const int taps = 3; -#endif // CHECK_RESULTS - - // Horizontal filter - // As the kernel is simply {1, 0, -1}, we implement this as simply - // out[x] = image[x-1] - image[x+1] - // rather than doing a "proper" convolution operation - for (int y = -1; y < DISFLOW_PATCH_SIZE + 1; ++y) { - const uint8_t *src_row = src + y * src_stride; - int16_t *tmp_row = tmp + y * DISFLOW_PATCH_SIZE; - - // Load pixels and expand to 16 bits - __m128i row = _mm_loadu_si128((__m128i *)(src_row - 1)); - __m128i px0 = _mm_cvtepu8_epi16(row); - __m128i px2 = _mm_cvtepu8_epi16(_mm_srli_si128(row, 2)); - - __m128i out = _mm_sub_epi16(px0, px2); - - // Store to intermediate array - _mm_storeu_si128((__m128i *)tmp_row, out); - -#if CHECK_RESULTS - // Cross-check - static const int16_t h_kernel[3] = { 1, 0, -1 }; - for (int x = 0; x < DISFLOW_PATCH_SIZE; ++x) { - int sum = 0; - for (int k = 0; 
k < taps; ++k) { - sum += h_kernel[k] * src_row[x + k - 1]; - } - (void)sum; - assert(tmp_row[x] == sum); - } -#endif // CHECK_RESULTS - } - - // Vertical filter - // Here the kernel is {1, 2, 1}, which can be implemented - // with simple sums rather than multiplies and adds. - // In order to minimize dependency chains, we evaluate in the order - // (image[y - 1] + image[y + 1]) + (image[y] << 1) - // This way, the first addition and the shift can happen in parallel - for (int y = 0; y < DISFLOW_PATCH_SIZE; ++y) { - const int16_t *tmp_row = tmp + y * DISFLOW_PATCH_SIZE; - int16_t *dst_row = dst + y * dst_stride; - - __m128i px0 = _mm_loadu_si128((__m128i *)(tmp_row - DISFLOW_PATCH_SIZE)); - __m128i px1 = _mm_loadu_si128((__m128i *)tmp_row); - __m128i px2 = _mm_loadu_si128((__m128i *)(tmp_row + DISFLOW_PATCH_SIZE)); - - __m128i out = - _mm_add_epi16(_mm_add_epi16(px0, px2), _mm_slli_epi16(px1, 1)); - - _mm_storeu_si128((__m128i *)dst_row, out); - -#if CHECK_RESULTS - static const int16_t v_kernel[3] = { 1, 2, 1 }; - for (int x = 0; x < DISFLOW_PATCH_SIZE; ++x) { - int sum = 0; - for (int k = 0; k < taps; ++k) { - sum += v_kernel[k] * tmp[(y + k - 1) * DISFLOW_PATCH_SIZE + x]; - } - (void)sum; - assert(dst_row[x] == sum); - } -#endif // CHECK_RESULTS - } -} - -static INLINE void sobel_filter_y(const uint8_t *src, int src_stride, - int16_t *dst, int dst_stride) { - int16_t tmp_[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 2)]; - int16_t *tmp = tmp_ + DISFLOW_PATCH_SIZE; -#if CHECK_RESULTS - const int taps = 3; -#endif // CHECK_RESULTS - - // Horizontal filter - // Here the kernel is {1, 2, 1}, which can be implemented - // with simple sums rather than multiplies and adds. - // In order to minimize dependency chains, we evaluate in the order - // (image[y - 1] + image[y + 1]) + (image[y] << 1) - // This way, the first addition and the shift can happen in parallel - for (int y = -1; y < DISFLOW_PATCH_SIZE + 1; ++y) { - const uint8_t *src_row = src + y * src_stride; - int16_t *tmp_row = tmp + y * DISFLOW_PATCH_SIZE; - - // Load pixels and expand to 16 bits - __m128i row = _mm_loadu_si128((__m128i *)(src_row - 1)); - __m128i px0 = _mm_cvtepu8_epi16(row); - __m128i px1 = _mm_cvtepu8_epi16(_mm_srli_si128(row, 1)); - __m128i px2 = _mm_cvtepu8_epi16(_mm_srli_si128(row, 2)); - - __m128i out = - _mm_add_epi16(_mm_add_epi16(px0, px2), _mm_slli_epi16(px1, 1)); - - // Store to intermediate array - _mm_storeu_si128((__m128i *)tmp_row, out); - -#if CHECK_RESULTS - // Cross-check - static const int16_t h_kernel[3] = { 1, 2, 1 }; - for (int x = 0; x < DISFLOW_PATCH_SIZE; ++x) { - int sum = 0; - for (int k = 0; k < taps; ++k) { - sum += h_kernel[k] * src_row[x + k - 1]; - } - (void)sum; - assert(tmp_row[x] == sum); - } -#endif // CHECK_RESULTS - } - - // Vertical filter - // As the kernel is simply {1, 0, -1}, we implement this as simply - // out[x] = image[x-1] - image[x+1] - // rather than doing a "proper" convolution operation - for (int y = 0; y < DISFLOW_PATCH_SIZE; ++y) { - const int16_t *tmp_row = tmp + y * DISFLOW_PATCH_SIZE; - int16_t *dst_row = dst + y * dst_stride; - - __m128i px0 = _mm_loadu_si128((__m128i *)(tmp_row - DISFLOW_PATCH_SIZE)); - __m128i px2 = _mm_loadu_si128((__m128i *)(tmp_row + DISFLOW_PATCH_SIZE)); - - __m128i out = _mm_sub_epi16(px0, px2); - - _mm_storeu_si128((__m128i *)dst_row, out); - -#if CHECK_RESULTS - static const int16_t v_kernel[3] = { 1, 0, -1 }; - for (int x = 0; x < DISFLOW_PATCH_SIZE; ++x) { - int sum = 0; - for (int k = 0; k < taps; ++k) { - sum += v_kernel[k] * tmp[(y 
+ k - 1) * DISFLOW_PATCH_SIZE + x]; - } - (void)sum; - assert(dst_row[x] == sum); - } -#endif // CHECK_RESULTS +// Compute the x and y gradients of the source patch in a single pass, +// and store into dx and dy respectively. +static INLINE void sobel_filter(const uint8_t *src, int src_stride, int16_t *dx, + int16_t *dy) { + // Loop setup: Load the first two rows (of 10 input rows) and apply + // the horizontal parts of the two filters + __m128i row_m1 = _mm_loadu_si128((__m128i *)(src - src_stride - 1)); + __m128i row_m1_a = _mm_cvtepu8_epi16(row_m1); + __m128i row_m1_b = _mm_cvtepu8_epi16(_mm_srli_si128(row_m1, 1)); + __m128i row_m1_c = _mm_cvtepu8_epi16(_mm_srli_si128(row_m1, 2)); + + __m128i row_m1_hsmooth = _mm_add_epi16(_mm_add_epi16(row_m1_a, row_m1_c), + _mm_slli_epi16(row_m1_b, 1)); + __m128i row_m1_hdiff = _mm_sub_epi16(row_m1_a, row_m1_c); + + __m128i row = _mm_loadu_si128((__m128i *)(src - 1)); + __m128i row_a = _mm_cvtepu8_epi16(row); + __m128i row_b = _mm_cvtepu8_epi16(_mm_srli_si128(row, 1)); + __m128i row_c = _mm_cvtepu8_epi16(_mm_srli_si128(row, 2)); + + __m128i row_hsmooth = + _mm_add_epi16(_mm_add_epi16(row_a, row_c), _mm_slli_epi16(row_b, 1)); + __m128i row_hdiff = _mm_sub_epi16(row_a, row_c); + + // Main loop: For each of the 8 output rows: + // * Load row i+1 and apply both horizontal filters + // * Apply vertical filters and store results + // * Shift rows for next iteration + for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) { + // Load row i+1 and apply both horizontal filters + const __m128i row_p1 = + _mm_loadu_si128((__m128i *)(src + (i + 1) * src_stride - 1)); + const __m128i row_p1_a = _mm_cvtepu8_epi16(row_p1); + const __m128i row_p1_b = _mm_cvtepu8_epi16(_mm_srli_si128(row_p1, 1)); + const __m128i row_p1_c = _mm_cvtepu8_epi16(_mm_srli_si128(row_p1, 2)); + + const __m128i row_p1_hsmooth = _mm_add_epi16( + _mm_add_epi16(row_p1_a, row_p1_c), _mm_slli_epi16(row_p1_b, 1)); + const __m128i row_p1_hdiff = _mm_sub_epi16(row_p1_a, row_p1_c); + + // Apply vertical filters and store results + // dx = vertical smooth(horizontal diff(input)) + // dy = vertical diff(horizontal smooth(input)) + const __m128i dx_row = + _mm_add_epi16(_mm_add_epi16(row_m1_hdiff, row_p1_hdiff), + _mm_slli_epi16(row_hdiff, 1)); + const __m128i dy_row = _mm_sub_epi16(row_m1_hsmooth, row_p1_hsmooth); + + _mm_storeu_si128((__m128i *)(dx + i * DISFLOW_PATCH_SIZE), dx_row); + _mm_storeu_si128((__m128i *)(dy + i * DISFLOW_PATCH_SIZE), dy_row); + + // Shift rows for next iteration + // This allows a lot of work to be reused, reducing the number of + // horizontal filtering operations from 2*3*8 = 48 to 2*10 = 20 + row_m1_hsmooth = row_hsmooth; + row_m1_hdiff = row_hdiff; + row_hsmooth = row_p1_hsmooth; + row_hdiff = row_p1_hdiff; } } @@ -476,30 +335,6 @@ static INLINE void compute_flow_matrix(const int16_t *dx, int dx_stride, // which is convenient for integerized SIMD implementation. 
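Written out (a worked check, not from the upstream comments): after the identity is added on the next line, the matrix is M = [[sum(dx*dx) + 1, sum(dx*dy)], [sum(dx*dy), sum(dy*dy) + 1]]. Since (sum(dx*dy))^2 <= sum(dx*dx) * sum(dy*dy) by Cauchy-Schwarz, det M >= sum(dx*dx) + sum(dy*dy) + 1 >= 1, which is the same guarantee that invert_2x2() relies on via the eigenvalue argument.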
result = _mm_add_epi32(result, _mm_set_epi32(1, 0, 0, 1)); -#if CHECK_RESULTS - int tmp[4] = { 0 }; - - for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) { - for (int j = 0; j < DISFLOW_PATCH_SIZE; j++) { - tmp[0] += dx[i * dx_stride + j] * dx[i * dx_stride + j]; - tmp[1] += dx[i * dx_stride + j] * dy[i * dy_stride + j]; - // Don't compute tmp[2], as it should be equal to tmp[1] - tmp[3] += dy[i * dy_stride + j] * dy[i * dy_stride + j]; - } - } - - // Apply regularization - tmp[0] += 1; - tmp[3] += 1; - - tmp[2] = tmp[1]; - - assert(tmp[0] == _mm_extract_epi32(result, 0)); - assert(tmp[1] == _mm_extract_epi32(result, 1)); - assert(tmp[2] == _mm_extract_epi32(result, 2)); - assert(tmp[3] == _mm_extract_epi32(result, 3)); -#endif // CHECK_RESULTS - // Convert results to doubles and store _mm_storeu_pd(M, _mm_cvtepi32_pd(result)); _mm_storeu_pd(M + 2, _mm_cvtepi32_pd(_mm_srli_si128(result, 8))); @@ -525,16 +360,15 @@ static INLINE void invert_2x2(const double *M, double *M_inv) { void aom_compute_flow_at_point_sse4_1(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v) { - double M[4]; - double M_inv[4]; + DECLARE_ALIGNED(16, double, M[4]); + DECLARE_ALIGNED(16, double, M_inv[4]); + DECLARE_ALIGNED(16, int16_t, dx[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE]); + DECLARE_ALIGNED(16, int16_t, dy[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE]); int b[2]; - int16_t dx[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE]; - int16_t dy[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE]; // Compute gradients within this patch const uint8_t *src_patch = &src[y * stride + x]; - sobel_filter_x(src_patch, stride, dx, DISFLOW_PATCH_SIZE); - sobel_filter_y(src_patch, stride, dy, DISFLOW_PATCH_SIZE); + sobel_filter(src_patch, stride, dx, dy); compute_flow_matrix(dx, DISFLOW_PATCH_SIZE, dy, DISFLOW_PATCH_SIZE, M); invert_2x2(M, M_inv); -- cgit v1.2.3
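For reference, the fused sobel_filter() introduced in both SIMD files above computes dx as the vertical {1, 2, 1} smoothing of the horizontal {1, 0, -1} difference, and dy as the vertical {1, 0, -1} difference of the horizontal {1, 2, 1} smoothing. The plain-C sketch below is illustrative only (it is not taken from the upstream sources, and the function name is hypothetical); like the SIMD versions, it assumes the source patch pointer has a readable one-pixel border.

#include <stdint.h>

#ifndef DISFLOW_PATCH_SIZE
#define DISFLOW_PATCH_SIZE 8
#endif

// Illustrative scalar equivalent of the vectorized sobel_filter() above.
static void sobel_filter_scalar_sketch(const uint8_t *src, int src_stride,
                                       int16_t *dx, int16_t *dy) {
  for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) {
    for (int j = 0; j < DISFLOW_PATCH_SIZE; j++) {
      // Horizontal 3-tap filters on the rows above, at, and below (i, j):
      // "smooth" = {1, 2, 1}, "diff" = {1, 0, -1}
      int hsmooth[3], hdiff[3];
      for (int k = -1; k <= 1; k++) {
        const uint8_t *row = &src[(i + k) * src_stride + j];
        hsmooth[k + 1] = row[-1] + 2 * row[0] + row[1];
        hdiff[k + 1] = row[-1] - row[1];
      }
      // dx = vertical smooth of the horizontal diff
      dx[i * DISFLOW_PATCH_SIZE + j] =
          (int16_t)(hdiff[0] + 2 * hdiff[1] + hdiff[2]);
      // dy = vertical diff of the horizontal smooth
      dy[i * DISFLOW_PATCH_SIZE + j] = (int16_t)(hsmooth[0] - hsmooth[2]);
    }
  }
}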