1 files changed, 64 insertions, 47 deletions
diff --git a/third_party/aom/av1/encoder/pickrst.c b/third_party/aom/av1/encoder/pickrst.c
index 6429064175..b0d0d0bb78 100644
--- a/third_party/aom/av1/encoder/pickrst.c
+++ b/third_party/aom/av1/encoder/pickrst.c
@@ -1103,6 +1103,39 @@ static INLINE int wrap_index(int i, int wiener_win) {
   return (i >= wiener_halfwin1 ? wiener_win - 1 - i : i);
 }
 
+// Splits each w[i] into a quotient w1[i] and a remainder w2[i] with respect to
+// WIENER_TAP_SCALE_FACTOR, so w[i] = w1[i] * WIENER_TAP_SCALE_FACTOR + w2[i].
+static INLINE void split_wiener_filter_coefficients(int wiener_win,
+                                                    const int32_t *w,
+                                                    int32_t *w1, int32_t *w2) {
+  for (int i = 0; i < wiener_win; i++) {
+    w1[i] = w[i] / WIENER_TAP_SCALE_FACTOR;  // quotient (integer division)
+    w2[i] = w[i] - w1[i] * WIENER_TAP_SCALE_FACTOR;  // what is left over
+    assert(w[i] == w1[i] * WIENER_TAP_SCALE_FACTOR + w2[i]);
+  }
+}
+
+// Calculates x * w / WIENER_TAP_SCALE_FACTOR, where
+// w = w1 * WIENER_TAP_SCALE_FACTOR + w2.
+//
+// The product x * w may overflow int64_t, so x is multiplied by the components
+// of w (w1 and w2) separately and the division is folded into the w2 term.
+static INLINE int64_t multiply_and_scale(int64_t x, int32_t w1, int32_t w2) {
+  // Let y = x * w / WIENER_TAP_SCALE_FACTOR
+  //       = x * (w1 * WIENER_TAP_SCALE_FACTOR + w2) / WIENER_TAP_SCALE_FACTOR
+  const int64_t y = x * w1 + x * w2 / WIENER_TAP_SCALE_FACTOR;
+  // Debug-only cross-check of y against the same value computed in __int128.
+  // TODO(wtc): Remove after 2024-04-30.
+#if !defined(NDEBUG) && defined(__GNUC__) && defined(__LP64__)
+  const int32_t w = w1 * WIENER_TAP_SCALE_FACTOR + w2;  // recombined w
+  const __int128 z = (__int128)x * w / WIENER_TAP_SCALE_FACTOR;
+  assert(z >= INT64_MIN);  // the wide result must fit in int64_t ...
+  assert(z <= INT64_MAX);
+  assert(y == (int64_t)z);  // ... and must equal the split computation
+#endif
+  return y;
+}
+
 // Solve linear equations to find Wiener filter tap values
 // Taps are output scaled by WIENER_FILT_STEP
 static int linsolve_wiener(int n, int64_t *A, int stride, int64_t *b,
@@ -1175,10 +1208,12 @@ static int linsolve_wiener(int n, int64_t *A, int stride, int64_t *b,
 
 // Fix vector b, update vector a
 static AOM_INLINE void update_a_sep_sym(int wiener_win, int64_t **Mc,
-                                        int64_t **Hc, int32_t *a, int32_t *b) {
+                                        int64_t **Hc, int32_t *a,
+                                        const int32_t *b) {
   int i, j;
   int64_t S[WIENER_WIN];
   int64_t A[WIENER_HALFWIN1], B[WIENER_HALFWIN1 * WIENER_HALFWIN1];
+  int32_t b1[WIENER_WIN], b2[WIENER_WIN];
   const int wiener_win2 = wiener_win * wiener_win;
   const int wiener_halfwin1 = (wiener_win >> 1) + 1;
   memset(A, 0, sizeof(A));
@@ -1189,16 +1224,7 @@ static AOM_INLINE void update_a_sep_sym(int wiener_win, int64_t **Mc,
       A[jj] += Mc[i][j] * b[i] / WIENER_TAP_SCALE_FACTOR;
     }
   }
-
-  // b/274668506: This is the dual branch for the issue in b/272139363. The fix
-  // is similar. See comments in update_b_sep_sym() below.
-  int32_t max_b_l = 0;
-  for (int l = 0; l < wiener_win; ++l) {
-    const int32_t abs_b_l = abs(b[l]);
-    if (abs_b_l > max_b_l) max_b_l = abs_b_l;
-  }
-  const int scale_threshold = 128 * WIENER_TAP_SCALE_FACTOR;
-  const int scaler = max_b_l < scale_threshold ? 1 : 4;
+  split_wiener_filter_coefficients(wiener_win, b, b1, b2);
 
   for (i = 0; i < wiener_win; i++) {
     for (j = 0; j < wiener_win; j++) {
@@ -1207,10 +1233,17 @@ static AOM_INLINE void update_a_sep_sym(int wiener_win, int64_t **Mc,
         const int kk = wrap_index(k, wiener_win);
         for (l = 0; l < wiener_win; ++l) {
           const int ll = wrap_index(l, wiener_win);
-          B[ll * wiener_halfwin1 + kk] +=
-              Hc[j * wiener_win + i][k * wiener_win2 + l] * b[i] /
-              (scaler * WIENER_TAP_SCALE_FACTOR) * b[j] /
-              (WIENER_TAP_SCALE_FACTOR / scaler);
+          // Calculate
+          // B[ll * wiener_halfwin1 + kk] +=
+          //    Hc[j * wiener_win + i][k * wiener_win2 + l] * b[i] /
+          //    WIENER_TAP_SCALE_FACTOR * b[j] / WIENER_TAP_SCALE_FACTOR;
+          //
+          // The last multiplication may overflow, so we combine the last
+          // multiplication with the last division.
+          const int64_t x = Hc[j * wiener_win + i][k * wiener_win2 + l] * b[i] /
+                            WIENER_TAP_SCALE_FACTOR;
+          // b[j] = b1[j] * WIENER_TAP_SCALE_FACTOR + b2[j]
+          B[ll * wiener_halfwin1 + kk] += multiply_and_scale(x, b1[j], b2[j]);
         }
       }
     }
@@ -1246,10 +1279,12 @@ static AOM_INLINE void update_a_sep_sym(int wiener_win, int64_t **Mc,
 
 // Fix vector a, update vector b
 static AOM_INLINE void update_b_sep_sym(int wiener_win, int64_t **Mc,
-                                        int64_t **Hc, int32_t *a, int32_t *b) {
+                                        int64_t **Hc, const int32_t *a,
+                                        int32_t *b) {
   int i, j;
   int64_t S[WIENER_WIN];
   int64_t A[WIENER_HALFWIN1], B[WIENER_HALFWIN1 * WIENER_HALFWIN1];
+  int32_t a1[WIENER_WIN], a2[WIENER_WIN];
   const int wiener_win2 = wiener_win * wiener_win;
   const int wiener_halfwin1 = (wiener_win >> 1) + 1;
   memset(A, 0, sizeof(A));
@@ -1260,32 +1295,7 @@ static AOM_INLINE void update_b_sep_sym(int wiener_win, int64_t **Mc,
       A[ii] += Mc[i][j] * a[j] / WIENER_TAP_SCALE_FACTOR;
     }
   }
-
-  // b/272139363: The computation,
-  //   Hc[i * wiener_win + j][k * wiener_win2 + l] * a[k] /
-  //          WIENER_TAP_SCALE_FACTOR * a[l] / WIENER_TAP_SCALE_FACTOR;
-  // may generate a signed-integer-overflow. Conditionally scale the terms to
-  // avoid a potential overflow.
-  //
-  // Hc contains accumulated correlation statistics and it is desired to leave
-  // as much room as possible for Hc. It was experimentally observed that the
-  // primary issue manifests itself with the second, a[l], multiply. For
-  // max_a_l < WIENER_TAP_SCALE_FACTOR the first multiply with a[k] should not
-  // increase dynamic range and the second multiply should hence be safe.
-  // Thereafter a safe scale_threshold depends on the actual operational range
-  // of Hc. The largest scale_threshold is expected to depend on bit-depth
-  // (av1_compute_stats_highbd_c() scales highbd to 8-bit) and maximum
-  // restoration-unit size (256), leading up to 32-bit positive numbers in Hc.
-  // Noting that the caller, wiener_decompose_sep_sym(), initializes a[...]
-  // to a range smaller than 16 bits, the scale_threshold is set as below for
-  // convenience.
-  int32_t max_a_l = 0;
-  for (int l = 0; l < wiener_win; ++l) {
-    const int32_t abs_a_l = abs(a[l]);
-    if (abs_a_l > max_a_l) max_a_l = abs_a_l;
-  }
-  const int scale_threshold = 128 * WIENER_TAP_SCALE_FACTOR;
-  const int scaler = max_a_l < scale_threshold ? 1 : 4;
+  split_wiener_filter_coefficients(wiener_win, a, a1, a2);
 
   for (i = 0; i < wiener_win; i++) {
     const int ii = wrap_index(i, wiener_win);
@@ -1294,10 +1304,17 @@ static AOM_INLINE void update_b_sep_sym(int wiener_win, int64_t **Mc,
       int k, l;
       for (k = 0; k < wiener_win; ++k) {
         for (l = 0; l < wiener_win; ++l) {
-          B[jj * wiener_halfwin1 + ii] +=
-              Hc[i * wiener_win + j][k * wiener_win2 + l] * a[k] /
-              (scaler * WIENER_TAP_SCALE_FACTOR) * a[l] /
-              (WIENER_TAP_SCALE_FACTOR / scaler);
+          // Calculate
+          // B[jj * wiener_halfwin1 + ii] +=
+          //     Hc[i * wiener_win + j][k * wiener_win2 + l] * a[k] /
+          //     WIENER_TAP_SCALE_FACTOR * a[l] / WIENER_TAP_SCALE_FACTOR;
+          //
+          // The last multiplication may overflow, so we combine the last
+          // multiplication with the last division.
+          const int64_t x = Hc[i * wiener_win + j][k * wiener_win2 + l] * a[k] /
+                            WIENER_TAP_SCALE_FACTOR;
+          // a[l] = a1[l] * WIENER_TAP_SCALE_FACTOR + a2[l]
+          B[jj * wiener_halfwin1 + ii] += multiply_and_scale(x, a1[l], a2[l]);
         }
       }
     }
@@ -2050,7 +2067,7 @@ void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi) {
           &cpi->trial_frame_rst, cm->superres_upscaled_width,
           cm->superres_upscaled_height, seq_params->subsampling_x,
           seq_params->subsampling_y, highbd, AOM_RESTORATION_FRAME_BORDER,
-          cm->features.byte_alignment, NULL, NULL, NULL, 0, 0))
+          cm->features.byte_alignment, NULL, NULL, NULL, false, 0))
     aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate trial restored frame buffer");